-- blob: 1dbd6fadb796fd93f5e466094fc70b17c8b8a50e [file] [log] [blame]
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned up outside this script.
--
-- This test is skipped on platforms that do not support
-- ORDERED AGGREGATES.
---------------------------------------------------------------------------
m4_include(`SQLCommon.m4')
m4_changequote(<!,!>)
m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!
-- Regex table: one row per regular expression, paired with the name under
-- which a match is reported. The names reappear with an R_ prefix in the
-- expected feature table further down in this script.
CREATE TABLE train_regex (
    pattern text,
    name    text
);
INSERT INTO train_regex (pattern, name) VALUES
    ('^[A-Z][a-z]+$', 'InitCapital'),
    ('^[A-Z]+$',      'isAllCapital'),
    ('^.*[0-9]+.*$',  'containsDigit'),
    ('^.+[.]$',       'endsWithDot'),
    ('^.+[,]$',       'endsWithComma'),
    ('^.+er$',        'endsWithER'),
    ('^.+est$',       'endsWithEst'),
    ('^.+ed$',        'endsWithED'),
    ('^.+s$',         'endsWithS'),
    ('^.+ing$',       'endsWithIng'),
    ('^.+ly$',        'endsWithly'),
    ('^.+-.+$',       'isDashSeparatedWords'),
    ('^.*@.*$',       'isEmailId');
-- Refresh planner statistics for the freshly loaded table.
ANALYZE train_regex;
-- Label table: maps each integer label id to its tag string. The ids are
-- the values stored in train_segmenttbl.label.
-- NOTE(review): the tags look like Penn Treebank part-of-speech tags plus
-- punctuation symbols; confirm before relying on that.
CREATE TABLE crf_label (
    id    integer,
    label character varying
);
INSERT INTO crf_label (id, label) VALUES
    ( 0, 'CC'),   ( 1, 'CD'),   ( 2, 'DT'),   ( 3, 'EX'),   ( 4, 'FW'),
    ( 5, 'IN'),   ( 6, 'JJ'),   ( 7, 'JJR'),  ( 8, 'JJS'),  ( 9, 'LS'),
    (10, 'MD'),   (11, 'NN'),   (12, 'NNS'),  (13, 'NNP'),  (14, 'NNPS'),
    (15, 'PDT'),  (16, 'POS'),  (17, 'PRP'),  (18, 'PRP$'), (19, 'RB'),
    (20, 'RBR'),  (21, 'RBS'),  (22, 'RP'),   (23, 'SYM'),  (24, 'TO'),
    (25, 'UH'),   (26, 'VB'),   (27, 'VBD'),  (28, 'VBG'),  (29, 'VBN'),
    (30, 'VBP'),  (31, 'VBZ'),  (32, 'WDT'),  (33, 'WP'),   (34, 'WP$'),
    (35, 'WRB'),  (36, '$'),    (37, '#'),    (38, ''''''), (39, '``'),
    (40, '('),    (41, ')'),    (42, ','),    (43, '.'),    (44, ':');
-- Refresh planner statistics for the freshly loaded table.
ANALYZE crf_label;
-- Training segments: one row per token.
--   start_pos : zero-based position of the token inside its document
--   doc_id    : document the token belongs to
--   seg_text  : the token text
--   label     : label id of the token (ids as listed in crf_label)
--   max_pos   : largest start_pos of the same document
CREATE TABLE train_segmenttbl (
    start_pos integer,
    doc_id    integer,
    seg_text  text,
    label     integer,
    max_pos   integer
);
INSERT INTO train_segmenttbl (start_pos, doc_id, seg_text, label, max_pos) VALUES
    -- document 1, positions 0 to 36
    ( 0, 1, 'confidence',  11, 36), ( 1, 1, 'in',          5,  36), ( 2, 1, 'the',         2,  36),
    ( 3, 1, 'pound',       11, 36), ( 4, 1, 'is',          31, 36), ( 5, 1, 'widely',      19, 36),
    ( 6, 1, 'expected',    29, 36), ( 7, 1, 'to',          24, 36), ( 8, 1, 'take',        26, 36),
    ( 9, 1, 'another',     2,  36), (10, 1, 'sharp',       6,  36), (11, 1, 'dive',        11, 36),
    (12, 1, 'if',          5,  36), (13, 1, 'trade',       11, 36), (14, 1, 'figures',     12, 36),
    (15, 1, 'for',         5,  36), (16, 1, 'september',   13, 36), (17, 1, ',',           42, 36),
    (18, 1, 'due',         6,  36), (19, 1, 'for',         5,  36), (20, 1, 'release',     11, 36),
    (21, 1, 'tomorrow',    11, 36), (22, 1, ',',           42, 36), (23, 1, 'fail',        26, 36),
    (24, 1, 'to',          24, 36), (25, 1, 'show',        26, 36), (26, 1, 'a',           2,  36),
    (27, 1, 'substantial', 6,  36), (28, 1, 'improvement', 11, 36), (29, 1, 'from',        5,  36),
    (30, 1, 'july',        13, 36), (31, 1, 'and',         0,  36), (32, 1, 'august',      13, 36),
    (33, 1, '''s',         16, 36), (34, 1, 'near-record', 6,  36), (35, 1, 'deficits',    12, 36),
    (36, 1, '.',           43, 36),
    -- document 2, positions 0 to 26
    ( 0, 2, 'chancellor',  13, 26), ( 1, 2, 'of',          5,  26), ( 2, 2, 'the',         2,  26),
    ( 3, 2, 'exchequer',   13, 26), ( 4, 2, 'nigel',       13, 26), ( 5, 2, 'lawson',      13, 26),
    ( 6, 2, '''s',         16, 26), ( 7, 2, 'restated',    29, 26), ( 8, 2, 'commitment',  11, 26),
    ( 9, 2, 'to',          24, 26), (10, 2, 'a',           2,  26), (11, 2, 'firm',        11, 26),
    (12, 2, 'monetary',    6,  26), (13, 2, 'policy',      11, 26), (14, 2, 'has',         31, 26),
    (15, 2, 'helped',      29, 26), (16, 2, 'to',          24, 26), (17, 2, 'prevent',     26, 26),
    (18, 2, 'a',           2,  26), (19, 2, 'freefall',    11, 26), (20, 2, 'in',          5,  26),
    (21, 2, 'sterling',    11, 26), (22, 2, 'over',        5,  26), (23, 2, 'the',         2,  26),
    (24, 2, 'past',        6,  26), (25, 2, 'week',        11, 26), (26, 2, '.',           43, 26);
-- Step 1: feature generation from the segment, regex and label tables
-- created above.
-- NOTE(review): the last three arguments are presumably the names of tables
-- that crf_train_fgen creates (dictionary, features, feature set); none of
-- them is created elsewhere in this script. Confirm against the function
-- documentation.
SELECT crf_train_fgen(
    'train_segmenttbl',
    'train_regex',
    'crf_label',
    'train_dictionary',
    'train_featuretbl',
    'train_featureset'
);
-- Step 2: train the linear-chain CRF on the generated features. The table
-- train_crf_feature is read by the assertions that follow.
-- NOTE(review): train_stats is presumably an output statistics table and 20
-- presumably the maximum number of iterations; confirm against the
-- lincrf_train documentation.
SELECT lincrf_train(
    'train_featuretbl',
    'train_featureset',
    'crf_label',
    'train_stats',
    'train_crf_feature',
    20
);
-- Expected feature table
-- Reference output that the trained model is compared against by the
-- assertions at the end of this script. The weights were produced by
-- Dr. Sunita's CRF Java package from the same input.
--
-- Columns:
--   id         : sequential row number, 0 to 114 (not used by the assertions)
--   name       : feature name. Names starting with W_ carry a token from
--                train_segmenttbl.seg_text, names starting with R_ carry a
--                name from train_regex; the remaining names are U, E., S.
--                and End.
--   prev_label : -1 on every row except the E. rows, which carry a label id
--                (presumably the label of the preceding token; confirm)
--   label      : label id (ids as listed in crf_label)
--   weight     : reference weight of the feature
CREATE TABLE expected_crf_feature(id integer,name text,prev_label integer,label integer,weight float);
INSERT INTO expected_crf_feature VALUES
(0,'U',-1,6,2.03722251114241), (1,'E.',2,11,2.74617537223778),
(2,'W_exchequer',-1,13,1.82177698489332), (3,'W_is',-1,31,1.802385230083),
(4,'E.',11,31,2.46925702503352), (5,'W_in',-1,5,3.252359364881),
(6,'E.',11,12,1.30565415536303), (7,'U',-1,2,-0.385936767525155),
(8,'E.',31,29,1.95816439028514), (9,'U',-1,29,1.42284032323052),
(10,'R_endsWithIng',-1,11,1.06107711527577),(11,'W_of',-1,5,3.65295433311112),
(12,'S.',-1,13,1.82987757221008), (13,'E.',24,26,3.28278206865014),
(14,'W_helped',-1,29,1.21452412817747), (15,'E.',11,24,1.5567743891308),
(16,'W_freefall',-1,11,2.39212599502655), (17,'W_chancellor',-1,13,4.07117751351007),
(18,'R_endsWithly',-1,19,1.80826054380731), (19,'R_endsWithS',-1,31,2.53099843051424),
(20,'E.',11,42,2.28570181337111), (21,'W_to',-1,24,5.09639332107322),
(22,'W_''s',-1,16,2.7288337760029), (23,'E.',5,13,2.48962334080407),
(24,'R_endsWithS',-1,12,3.55652271260539), (25,'W_from',-1,5,2.80284597986732),
(26,'S.',-1,11,0.141368455521256), (27,'W_confidence',-1,11,3.26245533963349),
(28,'W_dive',-1,11,1.29592338493044), (29,'E.',6,11,3.32147229704132),
(30,'E.',26,24,1.87855269498337), (31,'W_commitment',-1,11,1.96406781710126),
(32,'W_trade',-1,11,2.10065153867223), (33,'E.',42,26,3.01594067651091),
(34,'E.',13,13,2.03515687113842), (35,'E.',5,11,3.06727298680823),
(36,'E.',0,13,2.32674448027563), (37,'U',-1,26,1.98387304732704),
(38,'E.',6,5,1.92223183930121), (39,'E.',29,24,1.80599049616213),
(40,'W_tomorrow',-1,11,3.34106414300749), (41,'E.',11,6,1.03508232843802),
(42,'R_endsWithly',-1,13,1.53563812818195), (43,'W_figures',-1,12,2.78002901173385),
(44,'W_week',-1,11,1.88508053467186), (45,'W_restated',-1,29,1.58503375875973),
(46,'W_due',-1,6,3.39258895715363), (47,'W_august',-1,13,1.34455487966969),
(48,'W_take',-1,26,1.44523808187943), (49,'W_monetary',-1,6,4.05870827285358),
(50,'E.',2,6,3.11382961918855), (51,'W_improvement',-1,11,1.40352707737839),
(52,'W_past',-1,6,2.16725372894534), (53,'W_for',-1,5,3.98832124009076),
(54,'E.',13,5,1.55918826214718), (55,'E.',26,2,3.34688050245804),
(56,'R_endsWithER',-1,2,1.89184024107101), (57,'E.',13,0,2.40815194836302),
(58,'E.',6,12,1.9178899227945), (59,'W_nigel',-1,13,3.67154306653685),
(60,'R_endsWithS',-1,16,2.52266546833986), (61,'W_september',-1,13,2.47229643365359),
(62,'W_over',-1,5,1.72285539674115), (63,'E.',16,29,1.86715666934797),
(64,'W_if',-1,5,3.21948182538101), (65,'E.',19,29,2.11563252627385),
(66,'E.',16,6,1.43031410102084), (67,'W_the',-1,2,3.93840509018036),
(68,'U',-1,5,0.928194516517659), (69,'W_pound',-1,11,1.73917338002567),
(70,'W_firm',-1,11,2.65768224731759), (71,'W_lawson',-1,13,1.73698153439168),
(72,'W_expected',-1,29,0.9794597248659), (73,'W_show',-1,26,1.27116786883383),
(74,'W_prevent',-1,26,1.37958745070806), (75,'End.',-1,43,3.31987423296349),
(76,'W_substantial',-1,6,2.14611716740351), (77,'E.',5,2,3.4024258508775),
(78,'E.',24,2,3.1742588055445), (79,'W_sterling',-1,11,1.06107711527577),
(80,'U',-1,13,2.20779920646185), (81,'E.',13,42,2.23657409199101),
(82,'W_widely',-1,19,1.86114513803379), (83,'W_another',-1,2,2.0317791611429),
(84,'W_fail',-1,26,2.67978550279999), (85,'U',-1,0,1.13363354450703),
(86,'E.',11,5,3.00584804510012), (87,'W_deficits',-1,12,2.747158855043),
(88,'R_endsWithER',-1,13,2.76776898649185), (89,'E.',31,19,2.63372257558732),
(90,'U',-1,19,1.10718052473034), (91,'R_isDashSeparatedWords',-1,6,1.85284586706307),
(92,'W_sharp',-1,6,2.21218045077608), (93,'E.',11,43,1.46745796387309),
(94,'R_endsWithED',-1,29,3.77901761180312), (95,'W_policy',-1,11,0.862039501946008),
(96,'W_near-record',-1,6,1.85284586706307), (97,'U',-1,31,1.28095963969112),
(98,'U',-1,12,0.972489859004352), (99,'E.',13,16,3.37480330473993),
(100,'E.',29,11,2.1852156927695), (101,'W_has',-1,31,2.19831004450907),
(102,'R_endsWithER',-1,5,1.42946480149656), (103,'U',-1,11,3.03749222923604),
(104,'E.',12,43,2.24487997642474), (105,'W_july',-1,13,2.03893082794191),
(106,'E.',12,5,2.20042084572669), (107,'E.',2,13,1.87183716602929),
(108,'W_,',-1,42,4.35067339378978), (109,'E.',11,11,0.363824439333992),
(110,'W_a',-1,2,3.43362694716039), (111,'W_release',-1,11,3.76470309834533),
(112,'W_and',-1,0,4.16183350730113), (113,'W_.',-1,43,3.31987423296349),
(114,'E.',42,6,2.79544657270948);
-- Compare the trained weights with the reference weights. Features are
-- matched on (name, previous label, label) and the assertion fails when the
-- summed absolute weight difference reaches 0.1.
--
-- SUM over an empty join is NULL, which would make the comparison NULL
-- instead of FALSE. The COUNT guard forces a definite FALSE in that case, so
-- the check cannot pass vacuously when no trained feature matches.
-- NOTE(review): how assert treats a NULL condition is not visible in this
-- file; the guard makes the outcome independent of that.
SELECT assert(
    COUNT(*) > 0 AND SUM(abs(c1.weight - c2.weight)) < 0.1,
    'Total difference between extracted feature weights and expected feature weights is > 0.1.')
FROM expected_crf_feature AS c1
INNER JOIN train_crf_feature AS c2
    ON  c1.name = c2.name
    AND c1.prev_label = c2.prev_label_id
    AND c1.label = c2.label_id;
-- Check that the trained feature set equals the expected feature set.
-- Both directions of the multiset difference over
-- (name, previous label, label) are counted; the assertion fails unless
-- both counts are zero.
SELECT assert(
    exp_only.n + trn_only.n = 0,
    'Features extracted do not match expected features.')
FROM (
    -- number of expected features that the trained model lacks
    SELECT count(*) AS n
    FROM (
        SELECT name, prev_label, label
        FROM expected_crf_feature
        EXCEPT ALL
        SELECT name, prev_label_id, label_id
        FROM train_crf_feature
    ) AS expected_minus_trained
) AS exp_only
CROSS JOIN (
    -- number of trained features that are not expected
    SELECT count(*) AS n
    FROM (
        SELECT name, prev_label_id, label_id
        FROM train_crf_feature
        EXCEPT ALL
        SELECT name, prev_label, label
        FROM expected_crf_feature
    ) AS trained_minus_expected
) AS trn_only;
!>)
m4_changequote(<!`!>,<!'!>)