| --------------------------------------------------------------------------- |
| -- Rules: |
| -- ------ |
| -- 1) Any DB objects should be created w/o schema prefix, |
| -- since this file is executed in a separate schema context. |
| -- 2) There should be no DROP statements in this script, since |
| -- all objects created in the default schema will be cleaned-up outside. |
| --------------------------------------------------------------------------- |
| |
| --------------------------------------------------------------------------- |
| -- Setup: |
| --------------------------------------------------------------------------- |
| |
| CREATE FUNCTION install_test() RETURNS TEXT AS $$ |
| DECLARE |
| result_count INT; |
| result TEXT; |
| temp0 TEXT; |
| temp1 TEXT; |
| temp2 TEXT; |
| temp3 TEXT; |
| temp4 TEXT; |
| BEGIN |
| -- Documents table |
| CREATE TABLE textfex_document (id integer,text text); |
| INSERT INTO textfex_document VALUES |
| (1,E'Chancellor of the Exchequer Nigel Lawson ''s restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week .'), |
| (2,E'But analysts reckon underlying support for sterling has been eroded by the chancellor ''s failure to announce any new policy measures in his Mansion House speech last Thursday .'), |
| (3,E'His actions prevent disaster.'); |
| analyze textfex_document; |
| |
| -- Features table |
| CREATE TABLE textfex_feature (id integer,name text,prev_label_id integer,label_id integer,weight float); |
| INSERT INTO textfex_feature VALUES |
| (7,'E.2',2,13,0.8957754882780667), (79,'W_by',-1,5,2.4007337770109434), (41,'U',-1,26,1.1971775735896324), |
| (81,'E.11',11,16,2.5835310903807915), (55,'E.11',11,43,1.7591747767619539), (39,'E.29',29,24,0.6423980041639374), |
| (27,'E.2',2,11,4.021547537882094), (46,'W_in',-1,5,4.561697530405777), (53,'W_past',-1,6,2.425102532944761), |
| (25,'E.24',24,2,2.6245019400744196), (47,'E.5',5,11,2.475011699159048), (33,'W_policy',-1,11,3.279737748615338), |
| (4,'W_of',-1,5,2.874451743664169), (83,'W_failure',-1,11,1.6214155026731378), (71,'R_endsWithIng',-1,28,1.944154984738804), |
| (2,'E.13',13,5,2.3995379106666896), (94,'R_endsWithS',-1,18,2.1572134278343764), (38,'W_helped',-1,29,0.9484097417725005), |
| (96,'W_mansion',-1,13,2.3149078814680406), (86,'W_any',-1,2,2.7954051826004025), (88,'E.11',11,12,2.197492184359893), |
| (72,'E.28',28,11,2.692901638424271), (48,'W_sterling',-1,11,2.3077819829037525), (40,'E.24',24,26,3.7011711026137646), |
| (13,'W_lawson',-1,13,3.297240879683435), (24,'W_to',-1,24,4.53050982922731), (52,'E.2',2,6,3.1609626598041642), |
| (68,'E.30',30,28,2.900795206516394), (54,'W_week',-1,11,3.282148235551298), (10,'R_endsWithER',-1,13,2.449609048325598), |
| (63,'W_analysts',-1,12,1.0628309233894724), (92,'U',-1,18,1.025730795623997), (16,'E.16',16,29,1.9204482801150826), |
| (61,'E.0',0,12,2.6373656761539523), (18,'W_restated',-1,29,1.3818282059971994), (22,'W_commitment',-1,11,2.3459023587828263), |
| (87,'W_new',-1,6,2.6360680819475757), (64,'R_endsWithS',-1,12,2.442307827871107), (73,'W_support',-1,11,2.2821572827946244), |
| (60,'W_but',-1,0,2.728742100277871), (101,'E.6',6,13,1.8014203700324123), (56,'End.',-1,43,3.4595408822624947), |
| (26,'W_a',-1,2,2.677789823998926), (19,'R_endsWithED',-1,29,3.8020681451775755),(29,'E.11',11,6,2.492292680092348), |
| (70,'W_underlying',-1,28,2.09432344174987), (28,'W_firm',-1,11,1.9657228422125872), (75,'W_been',-1,29,2.129129599248343), |
| (45,'E.11',11,5,2.831083871985474), (5,'E.5',5,2,3.771637726308954), (57,'W_.',-1,43,3.4595408822624947), |
| (32,'E.6',6,11,2.938177772428602), (100,'W_last',-1,6,3.6718959040221013), (15,'W_''s',-1,16,5.074336301920056), |
| (1,'W_chancellor',-1,13,2.232243177253394), (34,'E.11',11,31,2.703318944384476), (8,'U',-1,13,1.7587494934234644), |
| (89,'W_measures',-1,12,2.2889484722940523), (0,'S.',-1,13,3.3985336034127775), (97,'W_house',-1,13,3.6015166265995555), |
| (44,'W_freefall',-1,11,1.8006899570542791), (62,'U',-1,12,1.3209737215186312), (20,'E.29',29,11,1.5449358644830369), |
| (9,'W_exchequer',-1,13,2.721269156136535), (80,'W_chancellor',-1,11,1.796888201507293), (21,'U',-1,11,1.4739961541941164), |
| (102,'W_thursday',-1,13,3.858752037897991), (37,'E.31',31,29,3.3704387252974217), (51,'R_endsWithER',-1,5,1.523379944589324), |
| (6,'W_the',-1,2,4.179075274351749), (95,'E.18',18,13,2.0919068469315785), (42,'W_prevent',-1,26,1.9130956387223363), |
| (93,'W_his',-1,18,2.440330155099354), (36,'R_endsWithS',-1,31,2.092642289935937), (35,'W_has',-1,31,2.452210128238285), |
| (14,'E.13',13,16,1.8094401262382602), (23,'E.11',11,24,3.4046169042228858), (30,'U',-1,6,1.7397221015334599), |
| (103,'E.13',13,43,2.1545378214335686), (78,'E.29',29,5,1.8987300180101108), (49,'R_endsWithIng',-1,11,1.8058171678503487), |
| (77,'W_eroded',-1,29,1.4718301974078694), (3,'U',-1,5,1.216476575073629), (90,'E.12',12,5,1.9609403598006498), |
| (50,'W_over',-1,5,1.5851324863666807), (43,'E.26',26,2,3.399063749955467), (11,'E.13',13,13,2.9624483035010245), |
| (91,'E.5',5,18,2.1990281868655663), (66,'U',-1,30,1.4276079759935312), (76,'E.29',29,29,1.6793149556330953), |
| (65,'E.12',12,30,2.4029290723287544), (12,'W_nigel',-1,13,2.0372945323878744), (85,'U',-1,2,-0.4472670967738198), |
| (17,'U',-1,29,1.6546332503126648), (98,'E.13',13,11,1.9289929156050765), (84,'W_announce',-1,26,1.8999638426719463), |
| (99,'W_speech',-1,11,3.570619697477909), (69,'U',-1,28,0.8360198402652405), (74,'W_for',-1,5,4.0872624914833775), |
| (31,'W_monetary',-1,6,3.3195839909271014), (59,'U',-1,0,1.6822590692281079), (82,'E.16',16,11,2.3919190723943817), |
| (58,'S.',-1,0,2.3563399364223314), (67,'W_reckon',-1,30,2.397247084073005); |
| analyze textfex_feature; |
| |
| -- Dictionary table |
| CREATE TABLE textfex_dictionary (token text,token_id integer,label text,count integer, total integer); |
| INSERT INTO textfex_dictionary VALUES |
| ('freefall',17,11,1,1), ('policy',13,11,2,2), ('measures',37,12,1,1), ('commitment',8,11,1,1), |
| ('new',36,6,1,1), ('speech',41,11,1,1), ('''s',6,16,2,2), ('reckon',26,30,1,1), |
| ('underlying',27,28,1,1),('week',22,11,1,1), ('prevent',16,26,1,1), ('has',14,31,2,2), |
| ('failure',33,11,1,1), ('restated',7,29,1,1),('announce',34,26,1,1), ('thursday',43,13,1,1), |
| ('but',24,0,1,1), ('lawson',5,13,1,1), ('last',42,6,1,1), ('firm',11,11,1,1), |
| ('exchequer',3,13,1,1), ('helped',15,29,1,1), ('sterling',19,11,2,2), ('been',30,29,1,1), |
| ('his',38,18,1,1), ('.',23,43,2,2), ('the',2,2,3,3), ('chancellor',0,11,1,2), |
| ('chancellor',0,13,1,2), ('in',18,5,2,2), ('any',35,2,1,1), ('analysts',25,12,1,1), |
| ('of',1,5,1,1), ('support',28,11,1,1),('by',32,5,1,1), ('over',20,5,1,1), |
| ('for',29,5,1,1), ('monetary',12,6,1,1),('mansion',39,13,1,1), ('eroded',31,29,1,1), |
| ('house',40,13,1,1), ('a',10,2,2,2), ('nigel',4,13,1,1), ('to',9,24,3,3), |
| ('past',21,6,1,1); |
| analyze textfex_dictionary; |
| |
| -- Regex table |
| CREATE TABLE textfex_regex (pattern text,name text); |
| INSERT INTO textfex_regex VALUES |
| ('^[A-Z][a-z]+$','InitCapital%'), ('^[A-Z]+$','isAllCapital%'), ('^.*[0-9]+.*$','containsDigit%'), |
| ('^.+[.]$','endsWithDot%'), ('^.+[,]$','endsWithComma%'), ('^.+er$','endsWithER%'), |
| ('^.+est$','endsWithEst%'), ('^.+ed$','endsWithED%'), ('^.+s$','endsWithS%'), |
| ('^.+ing$','endsWithIng%'), ('^.+ly$','endsWithly%'), ('^.+-.+$','isDashSeparatedWords%'), |
| ('^.*@.*$','isEmailId%'); |
| analyze textfex_regex; |
| |
| -- Labels table |
| CREATE TABLE textfex_label (id integer,label character varying); |
| INSERT INTO textfex_label VALUES |
| (0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'), |
| (9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'), |
| (18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'), |
| (27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'), |
| (36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':'); |
| analyze textfex_label; |
| |
| -- Segment table |
| CREATE TABLE textfex_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer); |
| INSERT INTO textfex_segmenttbl VALUES |
| (0,1,'chancellor',26),(1,1,'of',26), (2,1,'the',26), (3,1,'exchequer',26), (4,1,'nigel',26), |
| (5,1,'lawson',26), (6,1,'''s',26), (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26), |
| (10,1,'a',26), (11,1,'firm',26), (12,1,'monetary',26),(13,1,'policy',26), (14,1,'has',26), |
| (15,1,'helped',26), (16,1,'to',26), (17,1,'prevent',26), (18,1,'a',26), (19,1,'freefall',26), |
| (20,1,'in',26), (21,1,'sterling',26),(22,1,'over',26), (23,1,'the',26), (24,1,'past',26), |
| (25,1,'week',26), (26,1,'.',26), (0,2,'but',28), (1,2,'analysts',28), (2,2,'reckon',28), |
| (3,2,'underlying',28),(4,2,'support',28), (5,2,'for',28), (6,2,'sterling',28), (7,2,'has',28), |
| (8,2,'been',28), (9,2,'eroded',28), (10,2,'by',28), (11,2,'the',28), (12,2,'chancellor',28), |
| (13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28), |
| (18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28), |
| (23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28), |
| (28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4), |
| (4,3,'.',4); |
| analyze textfex_segmenttbl; |
| |
| -- extract features for tokens stored in segmenttbl |
| PERFORM MADLIB_SCHEMA.text_feature_extraction('textfex_segmenttbl','textfex_dictionary','textfex_label','textfex_regex','textfex_feature','viterbi_mtbl','viterbi_rtbl'); |
| |
| -- Expected viterbi labeling result |
| CREATE TABLE expected_extraction(doc_id integer, start_pos integer, seg_text text, label character varying); |
| INSERT INTO expected_extraction VALUES |
| (1,0,'chancellor','NNP'),(1,1,'of','IN'), (1,2,'the','DT'), (1,3,'exchequer','NNP'), (1,4,'nigel','NNP'), |
| (1,5,'lawson','NNP'), (1,6,'''s','POS'), (1,7,'restated','VBN'), (1,8,'commitment','NN'), (1,9,'to','TO'), |
| (1,10,'a','DT'), (1,11,'firm','NN'), (1,12,'monetary','JJ'), (1,13,'policy','NN'), (1,14,'has','VBZ'), |
| (1,15,'helped','VBN'), (1,16,'to','TO'), (1,17,'prevent','VB'), (1,18,'a','DT'), (1,19,'freefall','NN'), |
| (1,20,'in','IN'), (1,21,'sterling','NN'), (1,22,'over','IN'), (1,23,'the','DT'), (1,24,'past','JJ'), |
| (1,25,'week','NN'), (1,26,'.','.'), (2,0,'but','CC'), (2,1,'analysts','NNS'), (2,2,'reckon','VBP'), |
| (2,3,'underlying','VBG'),(2,4,'support','NN'), (2,5,'for','IN'), (2,6,'sterling','NN'), (2,7,'has','VBZ'), |
| (2,8,'been','VBN'), (2,9,'eroded','VBN'), (2,10,'by','IN'), (2,11,'the','DT'), (2,12,'chancellor','NN'), |
| (2,13,'''s','POS'), (2,14,'failure','NN'), (2,15,'to','TO'), (2,16,'announce','VB'), (2,17,'any','DT'), |
| (2,18,'new','JJ'), (2,19,'policy','NN'), (2,20,'measures','NNS'), (2,21,'in','IN'), (2,22,'his','PRP$'), |
| (2,23,'mansion','NNP'), (2,24,'house','NNP'), (2,25,'speech','NN'), (2,26,'last','JJ'), (2,27,'thursday','NNP'), |
| (2,28,'.','.'), (3,0,'his','NNP'), (3,1,'actions','NNP'), (3,2,'prevent','NNP'), (3,3,'disaster','NNP'), |
| (3,4,'.','.'); |
| |
| PERFORM MADLIB_SCHEMA.vcrf_label( |
| 'textfex_segmenttbl', |
| 'viterbi_mtbl', |
| 'viterbi_rtbl', |
| 'textfex_label', |
| 'extraction'); |
| |
| -- Compare the expected result and the viterbi extraction result. It succeeds |
| -- only if the two tables are the same. |
| SELECT s1.count + s2.count INTO result_count |
| FROM ( |
| SELECT count(*) FROM( |
| SELECT doc_id, start_pos, seg_text, label |
| FROM expected_extraction |
| EXCEPT ALL |
| SELECT doc_id, start_pos, seg_text, label |
| FROM extraction |
| ) AS U |
| )s1, |
| ( |
| SELECT count(*) FROM( |
| SELECT doc_id, start_pos, seg_text, label |
| FROM extraction |
| EXCEPT ALL |
| SELECT doc_id, start_pos, seg_text, label |
| FROM expected_extraction |
| ) AS U |
| )s2; |
| SELECT INTO result CASE WHEN (result_count = 0) THEN 'PASS' ELSE 'FAIL' END; |
| |
| IF result = 'FAIL' THEN |
| RAISE EXCEPTION 'Failed install check %', result_count; |
| END IF; |
| |
| RETURN result; |
| |
| END |
| $$ language plpgsql; |
| |
| --------------------------------------------------------------------------- |
| -- Test: |
| --------------------------------------------------------------------------- |
| SELECT install_test(); |