blob: 5d1f4bc403362f6c4a4515eed951dc85ea4f9b5c [file] [log] [blame]
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------
-- Documents table
CREATE TABLE crf_document (id integer,text text);
INSERT INTO crf_document VALUES
(1,E'Chancellor of the Exchequer Nigel Lawson ''s restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week .'),
(2,E'But analysts reckon underlying support for sterling has been eroded by the chancellor ''s failure to announce any new policy measures in his Mansion House speech last Thursday .'),
(3,E'His actions prevent disaster.');
analyze crf_document;
-- Features table
CREATE TABLE crf_feature_test (id integer,name text,prev_label_id integer,label_id integer,weight float);
INSERT INTO crf_feature_test VALUES
(7,'E.2',2,13,0.8957754882780667), (79,'W_by',-1,5,2.4007337770109434), (41,'U',-1,26,1.1971775735896324),
(81,'E.11',11,16,2.5835310903807915), (55,'E.11',11,43,1.7591747767619539), (39,'E.29',29,24,0.6423980041639374),
(27,'E.2',2,11,4.021547537882094), (46,'W_in',-1,5,4.561697530405777), (53,'W_past',-1,6,2.425102532944761),
(25,'E.24',24,2,2.6245019400744196), (47,'E.5',5,11,2.475011699159048), (33,'W_policy',-1,11,3.279737748615338),
(4,'W_of',-1,5,2.874451743664169), (83,'W_failure',-1,11,1.6214155026731378), (71,'R_endsWithIng',-1,28,1.944154984738804),
(2,'E.13',13,5,2.3995379106666896), (94,'R_endsWithS',-1,18,2.1572134278343764), (38,'W_helped',-1,29,0.9484097417725005),
(96,'W_mansion',-1,13,2.3149078814680406), (86,'W_any',-1,2,2.7954051826004025), (88,'E.11',11,12,2.197492184359893),
(72,'E.28',28,11,2.692901638424271), (48,'W_sterling',-1,11,2.3077819829037525), (40,'E.24',24,26,3.7011711026137646),
(13,'W_lawson',-1,13,3.297240879683435), (24,'W_to',-1,24,4.53050982922731), (52,'E.2',2,6,3.1609626598041642),
(68,'E.30',30,28,2.900795206516394), (54,'W_week',-1,11,3.282148235551298), (10,'R_endsWithER',-1,13,2.449609048325598),
(63,'W_analysts',-1,12,1.0628309233894724), (92,'U',-1,18,1.025730795623997), (16,'E.16',16,29,1.9204482801150826),
(61,'E.0',0,12,2.6373656761539523), (18,'W_restated',-1,29,1.3818282059971994), (22,'W_commitment',-1,11,2.3459023587828263),
(87,'W_new',-1,6,2.6360680819475757), (64,'R_endsWithS',-1,12,2.442307827871107), (73,'W_support',-1,11,2.2821572827946244),
(60,'W_but',-1,0,2.728742100277871), (101,'E.6',6,13,1.8014203700324123), (56,'End.',-1,43,3.4595408822624947),
(26,'W_a',-1,2,2.677789823998926), (19,'R_endsWithED',-1,29,3.8020681451775755),(29,'E.11',11,6,2.492292680092348),
(70,'W_underlying',-1,28,2.09432344174987), (28,'W_firm',-1,11,1.9657228422125872), (75,'W_been',-1,29,2.129129599248343),
(45,'E.11',11,5,2.831083871985474), (5,'E.5',5,2,3.771637726308954), (57,'W_.',-1,43,3.4595408822624947),
(32,'E.6',6,11,2.938177772428602), (100,'W_last',-1,6,3.6718959040221013), (15,'W_''s',-1,16,5.074336301920056),
(1,'W_chancellor',-1,13,2.232243177253394), (34,'E.11',11,31,2.703318944384476), (8,'U',-1,13,1.7587494934234644),
(89,'W_measures',-1,12,2.2889484722940523), (0,'S.',-1,13,3.3985336034127775), (97,'W_house',-1,13,3.6015166265995555),
(44,'W_freefall',-1,11,1.8006899570542791), (62,'U',-1,12,1.3209737215186312), (20,'E.29',29,11,1.5449358644830369),
(9,'W_exchequer',-1,13,2.721269156136535), (80,'W_chancellor',-1,11,1.796888201507293), (21,'U',-1,11,1.4739961541941164),
(102,'W_thursday',-1,13,3.858752037897991), (37,'E.31',31,29,3.3704387252974217), (51,'R_endsWithER',-1,5,1.523379944589324),
(6,'W_the',-1,2,4.179075274351749), (95,'E.18',18,13,2.0919068469315785), (42,'W_prevent',-1,26,1.9130956387223363),
(93,'W_his',-1,18,2.440330155099354), (36,'R_endsWithS',-1,31,2.092642289935937), (35,'W_has',-1,31,2.452210128238285),
(14,'E.13',13,16,1.8094401262382602), (23,'E.11',11,24,3.4046169042228858), (30,'U',-1,6,1.7397221015334599),
(103,'E.13',13,43,2.1545378214335686), (78,'E.29',29,5,1.8987300180101108), (49,'R_endsWithIng',-1,11,1.8058171678503487),
(77,'W_eroded',-1,29,1.4718301974078694), (3,'U',-1,5,1.216476575073629), (90,'E.12',12,5,1.9609403598006498),
(50,'W_over',-1,5,1.5851324863666807), (43,'E.26',26,2,3.399063749955467), (11,'E.13',13,13,2.9624483035010245),
(91,'E.5',5,18,2.1990281868655663), (66,'U',-1,30,1.4276079759935312), (76,'E.29',29,29,1.6793149556330953),
(65,'E.12',12,30,2.4029290723287544), (12,'W_nigel',-1,13,2.0372945323878744), (85,'U',-1,2,-0.4472670967738198),
(17,'U',-1,29,1.6546332503126648), (98,'E.13',13,11,1.9289929156050765), (84,'W_announce',-1,26,1.8999638426719463),
(99,'W_speech',-1,11,3.570619697477909), (69,'U',-1,28,0.8360198402652405), (74,'W_for',-1,5,4.0872624914833775),
(31,'W_monetary',-1,6,3.3195839909271014), (59,'U',-1,0,1.6822590692281079), (82,'E.16',16,11,2.3919190723943817),
(58,'S.',-1,0,2.3563399364223314), (67,'W_reckon',-1,30,2.397247084073005);
analyze crf_feature_test;
-- Dictionary table
CREATE TABLE crf_dictionary (token text,total integer);
INSERT INTO crf_dictionary VALUES
('freefall',1), ('policy',2), ('measures',1), ('commitment',1),
('new',1), ('speech',1), ('''s',2), ('reckon',1),
('underlying',1),('week',1), ('prevent',1), ('has',2),
('failure',1), ('restated',1),('announce',1), ('thursday',1),
('but',1), ('lawson',1), ('last',1), ('firm',1),
('exchequer',1), ('helped',1), ('sterling',2), ('been',1),
('his',1), ('.',2), ('the',3), ('chancellor',2),
('chancellor',2),('in',2), ('any',1), ('analysts',1),
('of',1), ('support',1), ('by',1), ('over',1),
('for',1), ('monetary',1),('mansion',1), ('eroded',1),
('house',1), ('a',2), ('nigel',1), ('to',3),
('past',1);
analyze crf_dictionary;
-- Regex table
CREATE TABLE crf_regex (pattern text,name text);
INSERT INTO crf_regex VALUES
('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'),
('^.+[.]$','endsWithDot'), ('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'), ('^.+s$','endsWithS'),
('^.+ing$','endsWithIng'), ('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'),
('^.*@.*$','isEmailId');
analyze crf_regex;
-- Labels table
CREATE TABLE test_crf_label (id integer,label character varying);
INSERT INTO test_crf_label VALUES
(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
analyze test_crf_label;
-- Segment table
CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
INSERT INTO test_segmenttbl VALUES
(0,1,'chancellor',26),(1,1,'of',26), (2,1,'the',26), (3,1,'exchequer',26), (4,1,'nigel',26),
(5,1,'lawson',26), (6,1,'''s',26), (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26),
(10,1,'a',26), (11,1,'firm',26), (12,1,'monetary',26),(13,1,'policy',26), (14,1,'has',26),
(15,1,'helped',26), (16,1,'to',26), (17,1,'prevent',26), (18,1,'a',26), (19,1,'freefall',26),
(20,1,'in',26), (21,1,'sterling',26),(22,1,'over',26), (23,1,'the',26), (24,1,'past',26),
(25,1,'week',26), (26,1,'.',26), (0,2,'but',28), (1,2,'analysts',28), (2,2,'reckon',28),
(3,2,'underlying',28),(4,2,'support',28), (5,2,'for',28), (6,2,'sterling',28), (7,2,'has',28),
(8,2,'been',28), (9,2,'eroded',28), (10,2,'by',28), (11,2,'the',28), (12,2,'chancellor',28),
(13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28),
(18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28),
(23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28),
(28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
(4,3,'.',4);
analyze test_segmenttbl;
-- extract features for tokens stored in segmenttbl
SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','test_crf_label','crf_regex','crf_feature_test','viterbi_mtbl','viterbi_rtbl');
-- Expected viterbi labeling result
-- The result is produced from Dr. Sunita's CRF java package with the same input
CREATE TABLE expected_extraction(doc_id integer, start_pos integer, seg_text text, label character varying);
INSERT INTO expected_extraction VALUES
(1,0,'chancellor','NNP'),(1,1,'of','IN'), (1,2,'the','DT'), (1,3,'exchequer','NNP'), (1,4,'nigel','NNP'),
(1,5,'lawson','NNP'), (1,6,'''s','POS'), (1,7,'restated','VBN'), (1,8,'commitment','NN'), (1,9,'to','TO'),
(1,10,'a','DT'), (1,11,'firm','NN'), (1,12,'monetary','JJ'), (1,13,'policy','NN'), (1,14,'has','VBZ'),
(1,15,'helped','VBN'), (1,16,'to','TO'), (1,17,'prevent','VB'), (1,18,'a','DT'), (1,19,'freefall','NN'),
(1,20,'in','IN'), (1,21,'sterling','NN'), (1,22,'over','IN'), (1,23,'the','DT'), (1,24,'past','JJ'),
(1,25,'week','NN'), (1,26,'.','.'), (2,0,'but','CC'), (2,1,'analysts','NNS'), (2,2,'reckon','VBP'),
(2,3,'underlying','VBG'),(2,4,'support','NN'), (2,5,'for','IN'), (2,6,'sterling','NN'), (2,7,'has','VBZ'),
(2,8,'been','VBN'), (2,9,'eroded','VBN'), (2,10,'by','IN'), (2,11,'the','DT'), (2,12,'chancellor','NN'),
(2,13,'''s','POS'), (2,14,'failure','NN'), (2,15,'to','TO'), (2,16,'announce','VB'), (2,17,'any','DT'),
(2,18,'new','JJ'), (2,19,'policy','NN'), (2,20,'measures','NNS'), (2,21,'in','IN'), (2,22,'his','PRP$'),
(2,23,'mansion','NNP'), (2,24,'house','NNP'), (2,25,'speech','NN'), (2,26,'last','JJ'), (2,27,'thursday','NNP'),
(2,28,'.','.'), (3,0,'his','NNP'), (3,1,'actions','NNP'), (3,2,'prevent','NNP'), (3,3,'disaster','NNP'),
(3,4,'.','.');
SELECT vcrf_label(
'test_segmenttbl',
'viterbi_mtbl',
'viterbi_rtbl',
'test_crf_label',
'extraction');
-- Compare the expected result and the viterbi extraction result. It succeeds
-- only if the two tables are the same.
SELECT assert(s1.count+s2.count = 0, 'Labels predicted do not match expected labels.')
FROM (
SELECT count(*) FROM(
SELECT doc_id, start_pos, seg_text, label
FROM expected_extraction
EXCEPT ALL
SELECT doc_id, start_pos, seg_text, label
FROM extraction
) AS U
)s1,
(
SELECT count(*) FROM(
SELECT doc_id, start_pos, seg_text, label
FROM extraction
EXCEPT ALL
SELECT doc_id, start_pos, seg_text, label
FROM expected_extraction
) AS U
)s2;