blob: 92c6c5b1b6ff17703f00ccc01bce91c3f6c4aaf8 [file] [log] [blame]
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-- Setup:
---------------------------------------------------------------------------
CREATE FUNCTION install_test() RETURNS TEXT AS $$
DECLARE
result_count INT;
result TEXT;
temp0 TEXT;
temp1 TEXT;
temp2 TEXT;
temp3 TEXT;
temp4 TEXT;
BEGIN
-- Documents table
CREATE TABLE textfex_document (id integer,text text);
INSERT INTO textfex_document VALUES
(1,E'Chancellor of the Exchequer Nigel Lawson ''s restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week .'),
(2,E'But analysts reckon underlying support for sterling has been eroded by the chancellor ''s failure to announce any new policy measures in his Mansion House speech last Thursday .'),
(3,E'His actions prevent disaster.');
analyze textfex_document;
-- Features table
CREATE TABLE textfex_feature (id integer,name text,prev_label_id integer,label_id integer,weight float);
INSERT INTO textfex_feature VALUES
(7,'E.2',2,13,0.8957754882780667), (79,'W_by',-1,5,2.4007337770109434), (41,'U',-1,26,1.1971775735896324),
(81,'E.11',11,16,2.5835310903807915), (55,'E.11',11,43,1.7591747767619539), (39,'E.29',29,24,0.6423980041639374),
(27,'E.2',2,11,4.021547537882094), (46,'W_in',-1,5,4.561697530405777), (53,'W_past',-1,6,2.425102532944761),
(25,'E.24',24,2,2.6245019400744196), (47,'E.5',5,11,2.475011699159048), (33,'W_policy',-1,11,3.279737748615338),
(4,'W_of',-1,5,2.874451743664169), (83,'W_failure',-1,11,1.6214155026731378), (71,'R_endsWithIng',-1,28,1.944154984738804),
(2,'E.13',13,5,2.3995379106666896), (94,'R_endsWithS',-1,18,2.1572134278343764), (38,'W_helped',-1,29,0.9484097417725005),
(96,'W_mansion',-1,13,2.3149078814680406), (86,'W_any',-1,2,2.7954051826004025), (88,'E.11',11,12,2.197492184359893),
(72,'E.28',28,11,2.692901638424271), (48,'W_sterling',-1,11,2.3077819829037525), (40,'E.24',24,26,3.7011711026137646),
(13,'W_lawson',-1,13,3.297240879683435), (24,'W_to',-1,24,4.53050982922731), (52,'E.2',2,6,3.1609626598041642),
(68,'E.30',30,28,2.900795206516394), (54,'W_week',-1,11,3.282148235551298), (10,'R_endsWithER',-1,13,2.449609048325598),
(63,'W_analysts',-1,12,1.0628309233894724), (92,'U',-1,18,1.025730795623997), (16,'E.16',16,29,1.9204482801150826),
(61,'E.0',0,12,2.6373656761539523), (18,'W_restated',-1,29,1.3818282059971994), (22,'W_commitment',-1,11,2.3459023587828263),
(87,'W_new',-1,6,2.6360680819475757), (64,'R_endsWithS',-1,12,2.442307827871107), (73,'W_support',-1,11,2.2821572827946244),
(60,'W_but',-1,0,2.728742100277871), (101,'E.6',6,13,1.8014203700324123), (56,'End.',-1,43,3.4595408822624947),
(26,'W_a',-1,2,2.677789823998926), (19,'R_endsWithED',-1,29,3.8020681451775755),(29,'E.11',11,6,2.492292680092348),
(70,'W_underlying',-1,28,2.09432344174987), (28,'W_firm',-1,11,1.9657228422125872), (75,'W_been',-1,29,2.129129599248343),
(45,'E.11',11,5,2.831083871985474), (5,'E.5',5,2,3.771637726308954), (57,'W_.',-1,43,3.4595408822624947),
(32,'E.6',6,11,2.938177772428602), (100,'W_last',-1,6,3.6718959040221013), (15,'W_''s',-1,16,5.074336301920056),
(1,'W_chancellor',-1,13,2.232243177253394), (34,'E.11',11,31,2.703318944384476), (8,'U',-1,13,1.7587494934234644),
(89,'W_measures',-1,12,2.2889484722940523), (0,'S.',-1,13,3.3985336034127775), (97,'W_house',-1,13,3.6015166265995555),
(44,'W_freefall',-1,11,1.8006899570542791), (62,'U',-1,12,1.3209737215186312), (20,'E.29',29,11,1.5449358644830369),
(9,'W_exchequer',-1,13,2.721269156136535), (80,'W_chancellor',-1,11,1.796888201507293), (21,'U',-1,11,1.4739961541941164),
(102,'W_thursday',-1,13,3.858752037897991), (37,'E.31',31,29,3.3704387252974217), (51,'R_endsWithER',-1,5,1.523379944589324),
(6,'W_the',-1,2,4.179075274351749), (95,'E.18',18,13,2.0919068469315785), (42,'W_prevent',-1,26,1.9130956387223363),
(93,'W_his',-1,18,2.440330155099354), (36,'R_endsWithS',-1,31,2.092642289935937), (35,'W_has',-1,31,2.452210128238285),
(14,'E.13',13,16,1.8094401262382602), (23,'E.11',11,24,3.4046169042228858), (30,'U',-1,6,1.7397221015334599),
(103,'E.13',13,43,2.1545378214335686), (78,'E.29',29,5,1.8987300180101108), (49,'R_endsWithIng',-1,11,1.8058171678503487),
(77,'W_eroded',-1,29,1.4718301974078694), (3,'U',-1,5,1.216476575073629), (90,'E.12',12,5,1.9609403598006498),
(50,'W_over',-1,5,1.5851324863666807), (43,'E.26',26,2,3.399063749955467), (11,'E.13',13,13,2.9624483035010245),
(91,'E.5',5,18,2.1990281868655663), (66,'U',-1,30,1.4276079759935312), (76,'E.29',29,29,1.6793149556330953),
(65,'E.12',12,30,2.4029290723287544), (12,'W_nigel',-1,13,2.0372945323878744), (85,'U',-1,2,-0.4472670967738198),
(17,'U',-1,29,1.6546332503126648), (98,'E.13',13,11,1.9289929156050765), (84,'W_announce',-1,26,1.8999638426719463),
(99,'W_speech',-1,11,3.570619697477909), (69,'U',-1,28,0.8360198402652405), (74,'W_for',-1,5,4.0872624914833775),
(31,'W_monetary',-1,6,3.3195839909271014), (59,'U',-1,0,1.6822590692281079), (82,'E.16',16,11,2.3919190723943817),
(58,'S.',-1,0,2.3563399364223314), (67,'W_reckon',-1,30,2.397247084073005);
analyze textfex_feature;
-- Dictionary table
CREATE TABLE textfex_dictionary (token text,token_id integer,label text,count integer, total integer);
INSERT INTO textfex_dictionary VALUES
('freefall',17,11,1,1), ('policy',13,11,2,2), ('measures',37,12,1,1), ('commitment',8,11,1,1),
('new',36,6,1,1), ('speech',41,11,1,1), ('''s',6,16,2,2), ('reckon',26,30,1,1),
('underlying',27,28,1,1),('week',22,11,1,1), ('prevent',16,26,1,1), ('has',14,31,2,2),
('failure',33,11,1,1), ('restated',7,29,1,1),('announce',34,26,1,1), ('thursday',43,13,1,1),
('but',24,0,1,1), ('lawson',5,13,1,1), ('last',42,6,1,1), ('firm',11,11,1,1),
('exchequer',3,13,1,1), ('helped',15,29,1,1), ('sterling',19,11,2,2), ('been',30,29,1,1),
('his',38,18,1,1), ('.',23,43,2,2), ('the',2,2,3,3), ('chancellor',0,11,1,2),
('chancellor',0,13,1,2), ('in',18,5,2,2), ('any',35,2,1,1), ('analysts',25,12,1,1),
('of',1,5,1,1), ('support',28,11,1,1),('by',32,5,1,1), ('over',20,5,1,1),
('for',29,5,1,1), ('monetary',12,6,1,1),('mansion',39,13,1,1), ('eroded',31,29,1,1),
('house',40,13,1,1), ('a',10,2,2,2), ('nigel',4,13,1,1), ('to',9,24,3,3),
('past',21,6,1,1);
analyze textfex_dictionary;
-- Regex table
CREATE TABLE textfex_regex (pattern text,name text);
INSERT INTO textfex_regex VALUES
('^[A-Z][a-z]+$','InitCapital%'), ('^[A-Z]+$','isAllCapital%'), ('^.*[0-9]+.*$','containsDigit%'),
('^.+[.]$','endsWithDot%'), ('^.+[,]$','endsWithComma%'), ('^.+er$','endsWithER%'),
('^.+est$','endsWithEst%'), ('^.+ed$','endsWithED%'), ('^.+s$','endsWithS%'),
('^.+ing$','endsWithIng%'), ('^.+ly$','endsWithly%'), ('^.+-.+$','isDashSeparatedWords%'),
('^.*@.*$','isEmailId%');
analyze textfex_regex;
-- Labels table
CREATE TABLE textfex_label (id integer,label character varying);
INSERT INTO textfex_label VALUES
(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
analyze textfex_label;
-- Segment table
CREATE TABLE textfex_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
INSERT INTO textfex_segmenttbl VALUES
(0,1,'chancellor',26),(1,1,'of',26), (2,1,'the',26), (3,1,'exchequer',26), (4,1,'nigel',26),
(5,1,'lawson',26), (6,1,'''s',26), (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26),
(10,1,'a',26), (11,1,'firm',26), (12,1,'monetary',26),(13,1,'policy',26), (14,1,'has',26),
(15,1,'helped',26), (16,1,'to',26), (17,1,'prevent',26), (18,1,'a',26), (19,1,'freefall',26),
(20,1,'in',26), (21,1,'sterling',26),(22,1,'over',26), (23,1,'the',26), (24,1,'past',26),
(25,1,'week',26), (26,1,'.',26), (0,2,'but',28), (1,2,'analysts',28), (2,2,'reckon',28),
(3,2,'underlying',28),(4,2,'support',28), (5,2,'for',28), (6,2,'sterling',28), (7,2,'has',28),
(8,2,'been',28), (9,2,'eroded',28), (10,2,'by',28), (11,2,'the',28), (12,2,'chancellor',28),
(13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28),
(18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28),
(23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28),
(28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
(4,3,'.',4);
analyze textfex_segmenttbl;
-- extract features for tokens stored in segmenttbl
PERFORM MADLIB_SCHEMA.text_feature_extraction('textfex_segmenttbl','textfex_dictionary','textfex_label','textfex_regex','textfex_feature','viterbi_mtbl','viterbi_rtbl');
-- Expected viterbi labeling result
CREATE TABLE expected_extraction(doc_id integer, start_pos integer, seg_text text, label character varying);
INSERT INTO expected_extraction VALUES
(1,0,'chancellor','NNP'),(1,1,'of','IN'), (1,2,'the','DT'), (1,3,'exchequer','NNP'), (1,4,'nigel','NNP'),
(1,5,'lawson','NNP'), (1,6,'''s','POS'), (1,7,'restated','VBN'), (1,8,'commitment','NN'), (1,9,'to','TO'),
(1,10,'a','DT'), (1,11,'firm','NN'), (1,12,'monetary','JJ'), (1,13,'policy','NN'), (1,14,'has','VBZ'),
(1,15,'helped','VBN'), (1,16,'to','TO'), (1,17,'prevent','VB'), (1,18,'a','DT'), (1,19,'freefall','NN'),
(1,20,'in','IN'), (1,21,'sterling','NN'), (1,22,'over','IN'), (1,23,'the','DT'), (1,24,'past','JJ'),
(1,25,'week','NN'), (1,26,'.','.'), (2,0,'but','CC'), (2,1,'analysts','NNS'), (2,2,'reckon','VBP'),
(2,3,'underlying','VBG'),(2,4,'support','NN'), (2,5,'for','IN'), (2,6,'sterling','NN'), (2,7,'has','VBZ'),
(2,8,'been','VBN'), (2,9,'eroded','VBN'), (2,10,'by','IN'), (2,11,'the','DT'), (2,12,'chancellor','NN'),
(2,13,'''s','POS'), (2,14,'failure','NN'), (2,15,'to','TO'), (2,16,'announce','VB'), (2,17,'any','DT'),
(2,18,'new','JJ'), (2,19,'policy','NN'), (2,20,'measures','NNS'), (2,21,'in','IN'), (2,22,'his','PRP$'),
(2,23,'mansion','NNP'), (2,24,'house','NNP'), (2,25,'speech','NN'), (2,26,'last','JJ'), (2,27,'thursday','NNP'),
(2,28,'.','.'), (3,0,'his','NNP'), (3,1,'actions','NNP'), (3,2,'prevent','NNP'), (3,3,'disaster','NNP'),
(3,4,'.','.');
PERFORM MADLIB_SCHEMA.vcrf_label(
'textfex_segmenttbl',
'viterbi_mtbl',
'viterbi_rtbl',
'textfex_label',
'extraction');
-- Compare the expected result and the viterbi extraction result. It succeeds
-- only if the two tables are the same.
SELECT s1.count + s2.count INTO result_count
FROM (
SELECT count(*) FROM(
SELECT doc_id, start_pos, seg_text, label
FROM expected_extraction
EXCEPT ALL
SELECT doc_id, start_pos, seg_text, label
FROM extraction
) AS U
)s1,
(
SELECT count(*) FROM(
SELECT doc_id, start_pos, seg_text, label
FROM extraction
EXCEPT ALL
SELECT doc_id, start_pos, seg_text, label
FROM expected_extraction
) AS U
)s2;
SELECT INTO result CASE WHEN (result_count = 0) THEN 'PASS' ELSE 'FAIL' END;
IF result = 'FAIL' THEN
RAISE EXCEPTION 'Failed install check %', result_count;
END IF;
RETURN result;
END
$$ language plpgsql;
---------------------------------------------------------------------------
-- Test:
---------------------------------------------------------------------------
SELECT install_test();