src/ports/postgres/modules/crf/test/crf_train_small.ic.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  *//* ----------------------------------------------------------------------- */

 ---------------------------------------------------------------------------
 -- Rules:
 -- ------
 -- 1) Any DB objects should be created w/o schema prefix,
 --    since this file is executed in a separate schema context.
 -- 2) There should be no DROP statements in this script, since
 --    all objects created in the default schema will be cleaned-up outside.
 --
 -- This test is not performed in those platforms which don't support
 -- ORDERED AGGREGATES.
 ---------------------------------------------------------------------------

 m4_include(`SQLCommon.m4')
 m4_changequote(<!,!>)

 m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!

 	-- Regex table
         CREATE TABLE train_regex(pattern text,name text);
         INSERT INTO train_regex VALUES
         ('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
         ('^.*[0-9]+.*$','containsDigit'),('^.+[.]$','endsWithDot'),
         ('^.+[,]$','endsWithComma'),     ('^.+er$','endsWithER'),
         ('^.+est$','endsWithEst'),       ('^.+ed$','endsWithED'),
         ('^.+s$','endsWithS'),           ('^.+ing$','endsWithIng'),
         ('^.+ly$','endsWithly'),         ('^.+-.+$','isDashSeparatedWords'),
         ('^.*@.*$','isEmailId');

 	    -- Label table
         CREATE TABLE crf_label (id integer,label character varying);
         INSERT INTO crf_label VALUES
         (0,'CC'),   (1,'CD'),  (2,'DT'),    (3,'EX'),   (4,'FW'),  (5,'IN'),   (6,'JJ'),  (7,'JJR'), (8,'JJS'),
         (9,'LS'),   (10,'MD'), (11,'NN'),   (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
         (18,'PRP$'),(19,'RB'), (20,'RBR'),  (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
         (27,'VBD'), (28,'VBG'),(29,'VBN'),  (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
         (36,'$'),   (37,'#'),  (38,''''''), (39,'``'),  (40,'('),  (41,')'),   (42,','),  (43,'.'),  (44,':');

         CREATE TABLE train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);
         INSERT INTO train_segmenttbl VALUES
         (0,1,'confidence',11,36),  (1,1,'in',5,36),         (2,1,'the',2,36),         (3,1,'pound',11,36),
         (4,1,'is',31,36),          (5,1,'widely',19,36),    (6,1,'expected',29,36),   (7,1,'to',24,36),
         (8,1,'take',26,36),        (9,1,'another',2,36),    (10,1,'sharp',6,36),      (11,1,'dive',11,36),
         (12,1,'if',5,36),          (13,1,'trade',11,36),    (14,1,'figures',12,36),   (15,1,'for',5,36),
         (16,1,'september',13,36),  (17,1,',',42,36),        (18,1,'due',6,36),        (19,1,'for',5,36),
         (20,1,'release',11,36),    (21,1,'tomorrow',11,36), (22,1,',',42,36),         (23,1,'fail',26,36),
         (24,1,'to',24,36),         (25,1,'show',26,36),     (26,1,'a',2,36),          (27,1,'substantial',6,36),
         (28,1,'improvement',11,36),(29,1,'from',5,36),      (30,1,'july',13,36),      (31,1,'and',0,36),
         (32,1,'august',13,36),     (33,1,'''s',16,36),      (34,1,'near-record',6,36),(35,1,'deficits',12,36),
         (36,1,'.',43,36),          (0,2,'chancellor',13,26),(1,2,'of',5,26),          (2,2,'the',2,26),
         (3,2,'exchequer',13,26),   (4,2,'nigel',13,26),     (5,2,'lawson',13,26),     (6,2,'''s',16,26),
         (7,2,'restated',29,26),    (8,2,'commitment',11,26),(9,2,'to',24,26),         (10,2,'a',2,26),
         (11,2,'firm',11,26),       (12,2,'monetary',6,26),  (13,2,'policy',11,26),    (14,2,'has',31,26),
         (15,2,'helped',29,26),     (16,2,'to',24,26),       (17,2,'prevent',26,26),   (18,2,'a',2,26),
         (19,2,'freefall',11,26),   (20,2,'in',5,26),        (21,2,'sterling',11,26),  (22,2,'over',5,26),
         (23,2,'the',2,26),         (24,2,'past',6,26),      (25,2,'week',11,26),      (26,2,'.',43,26);


         SELECT crf_train_fgen('train_segmenttbl', 'train_regex', 'crf_label', 'train_dictionary', 'train_featuretbl','train_featureset');


 !>)
 m4_changequote(<!`!>,<!'!>)
	/* ----------------------------------------------------------------------- //*
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	// ----------------------------------------------------------------------- */

	---------------------------------------------------------------------------
	-- Rules:
	-- ------
	-- 1) Any DB objects should be created w/o schema prefix,
	-- since this file is executed in a separate schema context.
	-- 2) There should be no DROP statements in this script, since
	-- all objects created in the default schema will be cleaned-up outside.
	--
	-- This test is not performed in those platforms which don't support
	-- ORDERED AGGREGATES.
	---------------------------------------------------------------------------

	m4_include(`SQLCommon.m4')
	m4_changequote(<!,!>)

	m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>,<!

	-- Regex table
	CREATE TABLE train_regex(pattern text,name text);
	INSERT INTO train_regex VALUES
	('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
	('^.[0-9]+.$','containsDigit'),('^.+[.]$','endsWithDot'),
	('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
	('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'),
	('^.+s$','endsWithS'), ('^.+ing$','endsWithIng'),
	('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'),
	('^.@.$','isEmailId');

	-- Label table
	CREATE TABLE crf_label (id integer,label character varying);
	INSERT INTO crf_label VALUES
	(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
	(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
	(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
	(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
	(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');

	CREATE TABLE train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);
	INSERT INTO train_segmenttbl VALUES
	(0,1,'confidence',11,36), (1,1,'in',5,36), (2,1,'the',2,36), (3,1,'pound',11,36),
	(4,1,'is',31,36), (5,1,'widely',19,36), (6,1,'expected',29,36), (7,1,'to',24,36),
	(8,1,'take',26,36), (9,1,'another',2,36), (10,1,'sharp',6,36), (11,1,'dive',11,36),
	(12,1,'if',5,36), (13,1,'trade',11,36), (14,1,'figures',12,36), (15,1,'for',5,36),
	(16,1,'september',13,36), (17,1,',',42,36), (18,1,'due',6,36), (19,1,'for',5,36),
	(20,1,'release',11,36), (21,1,'tomorrow',11,36), (22,1,',',42,36), (23,1,'fail',26,36),
	(24,1,'to',24,36), (25,1,'show',26,36), (26,1,'a',2,36), (27,1,'substantial',6,36),
	(28,1,'improvement',11,36),(29,1,'from',5,36), (30,1,'july',13,36), (31,1,'and',0,36),
	(32,1,'august',13,36), (33,1,'''s',16,36), (34,1,'near-record',6,36),(35,1,'deficits',12,36),
	(36,1,'.',43,36), (0,2,'chancellor',13,26),(1,2,'of',5,26), (2,2,'the',2,26),
	(3,2,'exchequer',13,26), (4,2,'nigel',13,26), (5,2,'lawson',13,26), (6,2,'''s',16,26),
	(7,2,'restated',29,26), (8,2,'commitment',11,26),(9,2,'to',24,26), (10,2,'a',2,26),
	(11,2,'firm',11,26), (12,2,'monetary',6,26), (13,2,'policy',11,26), (14,2,'has',31,26),
	(15,2,'helped',29,26), (16,2,'to',24,26), (17,2,'prevent',26,26), (18,2,'a',2,26),
	(19,2,'freefall',11,26), (20,2,'in',5,26), (21,2,'sterling',11,26), (22,2,'over',5,26),
	(23,2,'the',2,26), (24,2,'past',6,26), (25,2,'week',11,26), (26,2,'.',43,26);


	SELECT crf_train_fgen('train_segmenttbl', 'train_regex', 'crf_label', 'train_dictionary', 'train_featuretbl','train_featureset');


	!>)
	m4_changequote(<!`!>,<!'!>)