| --------------------------------------------------------------------------- |
| -- Rules: |
| -- ------ |
| -- 1) Any DB objects should be created w/o schema prefix, |
| -- since this file is executed in a separate schema context. |
| -- 2) There should be no DROP statements in this script, since |
| -- all objects created in the default schema will be cleaned-up outside. |
| --------------------------------------------------------------------------- |
| |
| -- Documents table |
| CREATE TABLE crf_document (id integer,text text); |
| INSERT INTO crf_document VALUES |
| (1,E'Chancellor of the Exchequer Nigel Lawson ''s restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week .'), |
| (2,E'But analysts reckon underlying support for sterling has been eroded by the chancellor ''s failure to announce any new policy measures in his Mansion House speech last Thursday .'), |
| (3,E'His actions prevent disaster.'); |
| analyze crf_document; |
| |
| -- Features table |
| CREATE TABLE crf_feature_test (id integer,name text,prev_label_id integer,label_id integer,weight float); |
| INSERT INTO crf_feature_test VALUES |
| (7,'E.2',2,13,0.8957754882780667), (79,'W_by',-1,5,2.4007337770109434), (41,'U',-1,26,1.1971775735896324), |
| (81,'E.11',11,16,2.5835310903807915), (55,'E.11',11,43,1.7591747767619539), (39,'E.29',29,24,0.6423980041639374), |
| (27,'E.2',2,11,4.021547537882094), (46,'W_in',-1,5,4.561697530405777), (53,'W_past',-1,6,2.425102532944761), |
| (25,'E.24',24,2,2.6245019400744196), (47,'E.5',5,11,2.475011699159048), (33,'W_policy',-1,11,3.279737748615338), |
| (4,'W_of',-1,5,2.874451743664169), (83,'W_failure',-1,11,1.6214155026731378), (71,'R_endsWithIng',-1,28,1.944154984738804), |
| (2,'E.13',13,5,2.3995379106666896), (94,'R_endsWithS',-1,18,2.1572134278343764), (38,'W_helped',-1,29,0.9484097417725005), |
| (96,'W_mansion',-1,13,2.3149078814680406), (86,'W_any',-1,2,2.7954051826004025), (88,'E.11',11,12,2.197492184359893), |
| (72,'E.28',28,11,2.692901638424271), (48,'W_sterling',-1,11,2.3077819829037525), (40,'E.24',24,26,3.7011711026137646), |
| (13,'W_lawson',-1,13,3.297240879683435), (24,'W_to',-1,24,4.53050982922731), (52,'E.2',2,6,3.1609626598041642), |
| (68,'E.30',30,28,2.900795206516394), (54,'W_week',-1,11,3.282148235551298), (10,'R_endsWithER',-1,13,2.449609048325598), |
| (63,'W_analysts',-1,12,1.0628309233894724), (92,'U',-1,18,1.025730795623997), (16,'E.16',16,29,1.9204482801150826), |
| (61,'E.0',0,12,2.6373656761539523), (18,'W_restated',-1,29,1.3818282059971994), (22,'W_commitment',-1,11,2.3459023587828263), |
| (87,'W_new',-1,6,2.6360680819475757), (64,'R_endsWithS',-1,12,2.442307827871107), (73,'W_support',-1,11,2.2821572827946244), |
| (60,'W_but',-1,0,2.728742100277871), (101,'E.6',6,13,1.8014203700324123), (56,'End.',-1,43,3.4595408822624947), |
| (26,'W_a',-1,2,2.677789823998926), (19,'R_endsWithED',-1,29,3.8020681451775755),(29,'E.11',11,6,2.492292680092348), |
| (70,'W_underlying',-1,28,2.09432344174987), (28,'W_firm',-1,11,1.9657228422125872), (75,'W_been',-1,29,2.129129599248343), |
| (45,'E.11',11,5,2.831083871985474), (5,'E.5',5,2,3.771637726308954), (57,'W_.',-1,43,3.4595408822624947), |
| (32,'E.6',6,11,2.938177772428602), (100,'W_last',-1,6,3.6718959040221013), (15,'W_''s',-1,16,5.074336301920056), |
| (1,'W_chancellor',-1,13,2.232243177253394), (34,'E.11',11,31,2.703318944384476), (8,'U',-1,13,1.7587494934234644), |
| (89,'W_measures',-1,12,2.2889484722940523), (0,'S.',-1,13,3.3985336034127775), (97,'W_house',-1,13,3.6015166265995555), |
| (44,'W_freefall',-1,11,1.8006899570542791), (62,'U',-1,12,1.3209737215186312), (20,'E.29',29,11,1.5449358644830369), |
| (9,'W_exchequer',-1,13,2.721269156136535), (80,'W_chancellor',-1,11,1.796888201507293), (21,'U',-1,11,1.4739961541941164), |
| (102,'W_thursday',-1,13,3.858752037897991), (37,'E.31',31,29,3.3704387252974217), (51,'R_endsWithER',-1,5,1.523379944589324), |
| (6,'W_the',-1,2,4.179075274351749), (95,'E.18',18,13,2.0919068469315785), (42,'W_prevent',-1,26,1.9130956387223363), |
| (93,'W_his',-1,18,2.440330155099354), (36,'R_endsWithS',-1,31,2.092642289935937), (35,'W_has',-1,31,2.452210128238285), |
| (14,'E.13',13,16,1.8094401262382602), (23,'E.11',11,24,3.4046169042228858), (30,'U',-1,6,1.7397221015334599), |
| (103,'E.13',13,43,2.1545378214335686), (78,'E.29',29,5,1.8987300180101108), (49,'R_endsWithIng',-1,11,1.8058171678503487), |
| (77,'W_eroded',-1,29,1.4718301974078694), (3,'U',-1,5,1.216476575073629), (90,'E.12',12,5,1.9609403598006498), |
| (50,'W_over',-1,5,1.5851324863666807), (43,'E.26',26,2,3.399063749955467), (11,'E.13',13,13,2.9624483035010245), |
| (91,'E.5',5,18,2.1990281868655663), (66,'U',-1,30,1.4276079759935312), (76,'E.29',29,29,1.6793149556330953), |
| (65,'E.12',12,30,2.4029290723287544), (12,'W_nigel',-1,13,2.0372945323878744), (85,'U',-1,2,-0.4472670967738198), |
| (17,'U',-1,29,1.6546332503126648), (98,'E.13',13,11,1.9289929156050765), (84,'W_announce',-1,26,1.8999638426719463), |
| (99,'W_speech',-1,11,3.570619697477909), (69,'U',-1,28,0.8360198402652405), (74,'W_for',-1,5,4.0872624914833775), |
| (31,'W_monetary',-1,6,3.3195839909271014), (59,'U',-1,0,1.6822590692281079), (82,'E.16',16,11,2.3919190723943817), |
| (58,'S.',-1,0,2.3563399364223314), (67,'W_reckon',-1,30,2.397247084073005); |
| analyze crf_feature_test; |
| |
| -- Dictionary table |
| CREATE TABLE crf_dictionary (token text,total integer); |
| INSERT INTO crf_dictionary VALUES |
| ('freefall',1), ('policy',2), ('measures',1), ('commitment',1), |
| ('new',1), ('speech',1), ('''s',2), ('reckon',1), |
| ('underlying',1),('week',1), ('prevent',1), ('has',2), |
| ('failure',1), ('restated',1),('announce',1), ('thursday',1), |
| ('but',1), ('lawson',1), ('last',1), ('firm',1), |
| ('exchequer',1), ('helped',1), ('sterling',2), ('been',1), |
| ('his',1), ('.',2), ('the',3), ('chancellor',2), |
| ('chancellor',2),('in',2), ('any',1), ('analysts',1), |
| ('of',1), ('support',1), ('by',1), ('over',1), |
| ('for',1), ('monetary',1),('mansion',1), ('eroded',1), |
| ('house',1), ('a',2), ('nigel',1), ('to',3), |
| ('past',1); |
| analyze crf_dictionary; |
| |
| -- Regex table |
| CREATE TABLE crf_regex (pattern text,name text); |
| INSERT INTO crf_regex VALUES |
| ('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'), |
| ('^.+[.]$','endsWithDot'), ('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'), |
| ('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'), ('^.+s$','endsWithS'), |
| ('^.+ing$','endsWithIng'), ('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'), |
| ('^.*@.*$','isEmailId'); |
| analyze crf_regex; |
| |
| -- Labels table |
| CREATE TABLE test_crf_label (id integer,label character varying); |
| INSERT INTO test_crf_label VALUES |
| (0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'), |
| (9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'), |
| (18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'), |
| (27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'), |
| (36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':'); |
| analyze test_crf_label; |
| |
| -- Segment table |
| CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer); |
| INSERT INTO test_segmenttbl VALUES |
| (0,1,'chancellor',26),(1,1,'of',26), (2,1,'the',26), (3,1,'exchequer',26), (4,1,'nigel',26), |
| (5,1,'lawson',26), (6,1,'''s',26), (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26), |
| (10,1,'a',26), (11,1,'firm',26), (12,1,'monetary',26),(13,1,'policy',26), (14,1,'has',26), |
| (15,1,'helped',26), (16,1,'to',26), (17,1,'prevent',26), (18,1,'a',26), (19,1,'freefall',26), |
| (20,1,'in',26), (21,1,'sterling',26),(22,1,'over',26), (23,1,'the',26), (24,1,'past',26), |
| (25,1,'week',26), (26,1,'.',26), (0,2,'but',28), (1,2,'analysts',28), (2,2,'reckon',28), |
| (3,2,'underlying',28),(4,2,'support',28), (5,2,'for',28), (6,2,'sterling',28), (7,2,'has',28), |
| (8,2,'been',28), (9,2,'eroded',28), (10,2,'by',28), (11,2,'the',28), (12,2,'chancellor',28), |
| (13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28), |
| (18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28), |
| (23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28), |
| (28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4), |
| (4,3,'.',4); |
| analyze test_segmenttbl; |
| |
| -- extract features for tokens stored in segmenttbl |
| SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','test_crf_label','crf_regex','crf_feature_test','viterbi_mtbl','viterbi_rtbl'); |
| |
| -- Expected viterbi labeling result |
| -- The result is produced from Dr. Sunita's CRF java package with the same input |
| CREATE TABLE expected_extraction(doc_id integer, start_pos integer, seg_text text, label character varying); |
| INSERT INTO expected_extraction VALUES |
| (1,0,'chancellor','NNP'),(1,1,'of','IN'), (1,2,'the','DT'), (1,3,'exchequer','NNP'), (1,4,'nigel','NNP'), |
| (1,5,'lawson','NNP'), (1,6,'''s','POS'), (1,7,'restated','VBN'), (1,8,'commitment','NN'), (1,9,'to','TO'), |
| (1,10,'a','DT'), (1,11,'firm','NN'), (1,12,'monetary','JJ'), (1,13,'policy','NN'), (1,14,'has','VBZ'), |
| (1,15,'helped','VBN'), (1,16,'to','TO'), (1,17,'prevent','VB'), (1,18,'a','DT'), (1,19,'freefall','NN'), |
| (1,20,'in','IN'), (1,21,'sterling','NN'), (1,22,'over','IN'), (1,23,'the','DT'), (1,24,'past','JJ'), |
| (1,25,'week','NN'), (1,26,'.','.'), (2,0,'but','CC'), (2,1,'analysts','NNS'), (2,2,'reckon','VBP'), |
| (2,3,'underlying','VBG'),(2,4,'support','NN'), (2,5,'for','IN'), (2,6,'sterling','NN'), (2,7,'has','VBZ'), |
| (2,8,'been','VBN'), (2,9,'eroded','VBN'), (2,10,'by','IN'), (2,11,'the','DT'), (2,12,'chancellor','NN'), |
| (2,13,'''s','POS'), (2,14,'failure','NN'), (2,15,'to','TO'), (2,16,'announce','VB'), (2,17,'any','DT'), |
| (2,18,'new','JJ'), (2,19,'policy','NN'), (2,20,'measures','NNS'), (2,21,'in','IN'), (2,22,'his','PRP$'), |
| (2,23,'mansion','NNP'), (2,24,'house','NNP'), (2,25,'speech','NN'), (2,26,'last','JJ'), (2,27,'thursday','NNP'), |
| (2,28,'.','.'), (3,0,'his','NNP'), (3,1,'actions','NNP'), (3,2,'prevent','NNP'), (3,3,'disaster','NNP'), |
| (3,4,'.','.'); |
| |
| SELECT vcrf_label( |
| 'test_segmenttbl', |
| 'viterbi_mtbl', |
| 'viterbi_rtbl', |
| 'test_crf_label', |
| 'extraction'); |
| |
| -- Compare the expected result and the viterbi extraction result. It succeeds |
| -- only if the two tables are the same. |
| SELECT assert(s1.count+s2.count = 0, 'Labels predicted do not match expected labels.') |
| FROM ( |
| SELECT count(*) FROM( |
| SELECT doc_id, start_pos, seg_text, label |
| FROM expected_extraction |
| EXCEPT ALL |
| SELECT doc_id, start_pos, seg_text, label |
| FROM extraction |
| ) AS U |
| )s1, |
| ( |
| SELECT count(*) FROM( |
| SELECT doc_id, start_pos, seg_text, label |
| FROM extraction |
| EXCEPT ALL |
| SELECT doc_id, start_pos, seg_text, label |
| FROM expected_extraction |
| ) AS U |
| )s2; |