blob: d258bb14bca540310610ec54b8d346ea4bbd55c4 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*//* ----------------------------------------------------------------------- */
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------
-- Documents table
-- CREATE TABLE crf_document (id integer,text text);
-- INSERT INTO crf_document VALUES
-- (1,E'Chancellor of the Exchequer Nigel Lawson ''s restated commitment to a firm monetary policy has helped to prevent a freefall in sterling over the past week .'),
-- (2,E'But analysts reckon underlying support for sterling has been eroded by the chancellor ''s failure to announce any new policy measures in his Mansion House speech last Thursday .'),
-- (3,E'His actions prevent disaster.');
-- analyze crf_document;
-- Features table
CREATE TABLE crf_feature_test (id integer,name text,prev_label_id integer,label_id integer,weight float);
INSERT INTO crf_feature_test VALUES
(7,'E.2',2,13,0.8957754882780667), (79,'W_by',-1,5,2.4007337770109434), (41,'U',-1,26,1.1971775735896324),
(81,'E.11',11,16,2.5835310903807915), (55,'E.11',11,43,1.7591747767619539), (39,'E.29',29,24,0.6423980041639374),
(27,'E.2',2,11,4.021547537882094), (46,'W_in',-1,5,4.561697530405777), (53,'W_past',-1,6,2.425102532944761),
(25,'E.24',24,2,2.6245019400744196), (47,'E.5',5,11,2.475011699159048), (33,'W_policy',-1,11,3.279737748615338),
(4,'W_of',-1,5,2.874451743664169), (83,'W_failure',-1,11,1.6214155026731378), (71,'R_endsWithIng',-1,28,1.944154984738804),
(2,'E.13',13,5,2.3995379106666896), (94,'R_endsWithS',-1,18,2.1572134278343764), (38,'W_helped',-1,29,0.9484097417725005),
(96,'W_mansion',-1,13,2.3149078814680406), (86,'W_any',-1,2,2.7954051826004025), (88,'E.11',11,12,2.197492184359893),
(72,'E.28',28,11,2.692901638424271), (48,'W_sterling',-1,11,2.3077819829037525), (40,'E.24',24,26,3.7011711026137646),
(13,'W_lawson',-1,13,3.297240879683435), (24,'W_to',-1,24,4.53050982922731), (52,'E.2',2,6,3.1609626598041642),
(68,'E.30',30,28,2.900795206516394), (54,'W_week',-1,11,3.282148235551298), (10,'R_endsWithER',-1,13,2.449609048325598),
(63,'W_analysts',-1,12,1.0628309233894724), (92,'U',-1,18,1.025730795623997), (16,'E.16',16,29,1.9204482801150826),
(61,'E.0',0,12,2.6373656761539523), (18,'W_restated',-1,29,1.3818282059971994), (22,'W_commitment',-1,11,2.3459023587828263),
(87,'W_new',-1,6,2.6360680819475757), (64,'R_endsWithS',-1,12,2.442307827871107), (73,'W_support',-1,11,2.2821572827946244),
(60,'W_but',-1,0,2.728742100277871), (101,'E.6',6,13,1.8014203700324123), (56,'End.',-1,43,3.4595408822624947),
(26,'W_a',-1,2,2.677789823998926), (19,'R_endsWithED',-1,29,3.8020681451775755),(29,'E.11',11,6,2.492292680092348),
(70,'W_underlying',-1,28,2.09432344174987), (28,'W_firm',-1,11,1.9657228422125872), (75,'W_been',-1,29,2.129129599248343),
(45,'E.11',11,5,2.831083871985474), (5,'E.5',5,2,3.771637726308954), (57,'W_.',-1,43,3.4595408822624947),
(32,'E.6',6,11,2.938177772428602), (100,'W_last',-1,6,3.6718959040221013), (15,'W_''s',-1,16,5.074336301920056),
(1,'W_chancellor',-1,13,2.232243177253394), (34,'E.11',11,31,2.703318944384476), (8,'U',-1,13,1.7587494934234644),
(89,'W_measures',-1,12,2.2889484722940523), (0,'S.',-1,13,3.3985336034127775), (97,'W_house',-1,13,3.6015166265995555),
(44,'W_freefall',-1,11,1.8006899570542791), (62,'U',-1,12,1.3209737215186312), (20,'E.29',29,11,1.5449358644830369),
(9,'W_exchequer',-1,13,2.721269156136535), (80,'W_chancellor',-1,11,1.796888201507293), (21,'U',-1,11,1.4739961541941164),
(102,'W_thursday',-1,13,3.858752037897991), (37,'E.31',31,29,3.3704387252974217), (51,'R_endsWithER',-1,5,1.523379944589324),
(6,'W_the',-1,2,4.179075274351749), (95,'E.18',18,13,2.0919068469315785), (42,'W_prevent',-1,26,1.9130956387223363),
(93,'W_his',-1,18,2.440330155099354), (36,'R_endsWithS',-1,31,2.092642289935937), (35,'W_has',-1,31,2.452210128238285),
(14,'E.13',13,16,1.8094401262382602), (23,'E.11',11,24,3.4046169042228858), (30,'U',-1,6,1.7397221015334599),
(103,'E.13',13,43,2.1545378214335686), (78,'E.29',29,5,1.8987300180101108), (49,'R_endsWithIng',-1,11,1.8058171678503487),
(77,'W_eroded',-1,29,1.4718301974078694), (3,'U',-1,5,1.216476575073629), (90,'E.12',12,5,1.9609403598006498),
(50,'W_over',-1,5,1.5851324863666807), (43,'E.26',26,2,3.399063749955467), (11,'E.13',13,13,2.9624483035010245),
(91,'E.5',5,18,2.1990281868655663), (66,'U',-1,30,1.4276079759935312), (76,'E.29',29,29,1.6793149556330953),
(65,'E.12',12,30,2.4029290723287544), (12,'W_nigel',-1,13,2.0372945323878744), (85,'U',-1,2,-0.4472670967738198),
(17,'U',-1,29,1.6546332503126648), (98,'E.13',13,11,1.9289929156050765), (84,'W_announce',-1,26,1.8999638426719463),
(99,'W_speech',-1,11,3.570619697477909), (69,'U',-1,28,0.8360198402652405), (74,'W_for',-1,5,4.0872624914833775),
(31,'W_monetary',-1,6,3.3195839909271014), (59,'U',-1,0,1.6822590692281079), (82,'E.16',16,11,2.3919190723943817),
(58,'S.',-1,0,2.3563399364223314), (67,'W_reckon',-1,30,2.397247084073005);
-- Dictionary table
CREATE TABLE crf_dictionary (token text,total integer);
INSERT INTO crf_dictionary VALUES
('freefall',1), ('policy',2), ('measures',1), ('commitment',1),
('new',1), ('speech',1), ('''s',2), ('reckon',1),
('underlying',1),('week',1), ('prevent',1), ('has',2),
('failure',1), ('restated',1),('announce',1), ('thursday',1),
('but',1), ('lawson',1), ('last',1), ('firm',1),
('exchequer',1), ('helped',1), ('sterling',2), ('been',1),
('his',1), ('.',2), ('the',3), ('chancellor',2),
('chancellor',2),('in',2), ('any',1), ('analysts',1),
('of',1), ('support',1), ('by',1), ('over',1),
('for',1), ('monetary',1),('mansion',1), ('eroded',1),
('house',1), ('a',2), ('nigel',1), ('to',3),
('past',1);
-- Regex table
CREATE TABLE crf_regex (pattern text,name text);
INSERT INTO crf_regex VALUES
('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'),
('^.+[.]$','endsWithDot'), ('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'), ('^.+s$','endsWithS'),
('^.+ing$','endsWithIng'), ('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'),
('^.*@.*$','isEmailId');
-- Labels table
CREATE TABLE test_crf_label (id integer,label character varying);
INSERT INTO test_crf_label VALUES
(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
-- Segment table
CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
INSERT INTO test_segmenttbl VALUES
(0,1,'chancellor',26),(1,1,'of',26), (2,1,'the',26), (3,1,'exchequer',26), (4,1,'nigel',26),
(5,1,'lawson',26), (6,1,'''s',26), (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26),
(10,1,'a',26), (11,1,'firm',26), (12,1,'monetary',26),(13,1,'policy',26), (14,1,'has',26),
(15,1,'helped',26), (16,1,'to',26), (17,1,'prevent',26), (18,1,'a',26), (19,1,'freefall',26),
(20,1,'in',26), (21,1,'sterling',26),(22,1,'over',26), (23,1,'the',26), (24,1,'past',26),
(25,1,'week',26), (26,1,'.',26), (0,2,'but',28), (1,2,'analysts',28), (2,2,'reckon',28),
(3,2,'underlying',28),(4,2,'support',28), (5,2,'for',28), (6,2,'sterling',28), (7,2,'has',28),
(8,2,'been',28), (9,2,'eroded',28), (10,2,'by',28), (11,2,'the',28), (12,2,'chancellor',28),
(13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28),
(18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28),
(23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28),
(28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
(4,3,'.',4);
-- extract features for tokens stored in segmenttbl
SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','test_crf_label','crf_regex','crf_feature_test','viterbi_mtbl','viterbi_rtbl');