-- blob: 9ededaa2e5554fee18dca451b024cfbe0af3a435 [file] [log] [blame]
-- (Source-browser header captured with the file; kept as a comment so the script parses.)
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
-- Features table produced by Dr. Sunita's CRF java package (as provided by Kun, "CRF2" package on github) using the training data in the file us50.train.tagged
-- One row per CRF feature. In the rows loaded below, name is 'S.', 'E.',
-- 'End.', 'U', 'W_<token>' or 'R_<regex name>'; prev_label_id is -1 on every
-- row except the 'E.' rows, which carry a label id in that column.
CREATE TABLE crf_feature_test_new (
    id            integer,
    name          text,
    prev_label_id integer,
    label_id      integer,
    weight        float
);
INSERT INTO crf_feature_test_new VALUES
(0, 'S.', -1, 11, 0.5516753522178934),
(1, 'W_freight', -1, 11, 5.959241076198326),
(2, 'E.', 11, 12, 2.0789747316372034),
(3, 'W_rates', -1, 12, 2.0837653985907174),
(4, 'R_endsWithS', -1, 12, 5.306222451221396),
(5, 'E.', 12, 42, 1.8107793215017256),
(6, 'W_,', -1, 42, 7.286509411020296),
(7, 'E.', 42, 28, 2.222715331802022),
(8, 'U', -1, 28, 0.9492501230997201),
(9, 'W_declining', -1, 28, 0.9644339685939746),
(10, 'R_endsWithIng', -1, 28, 5.3575087838268995),
(11, 'E.', 28, 5, 2.459070565360766),
(12, 'W_for', -1, 5, 4.538367636569374),
(13, 'E.', 5, 21, 2.633416169216998),
(14, 'U', -1, 21, 1.2724051325350452),
(15, 'W_most', -1, 21, 4.742804679577188),
(16, 'E.', 21, 5, 2.3563567868687803),
(17, 'W_of', -1, 5, 5.886369615822257),
(18, 'E.', 5, 2, 3.4072332563788974),
(19, 'W_the', -1, 2, 5.003139924048258),
(20, 'E.', 2, 11, 3.4951824947429704),
(21, 'U', -1, 11, 1.3530610141204105),
(22, 'W_decade', -1, 11, 2.5363441670581475),
(23, 'E.', 11, 5, 3.168556891460435),
(24, 'U', -1, 5, 0.8846547539288385),
(25, 'W_because', -1, 5, 4.371973715457267),
(26, 'E.', 5, 5, 2.3178076749274994),
(27, 'E.', 5, 11, 3.008379674551115),
(28, 'W_competition', -1, 11, 3.7657358279072044),
(29, 'E.', 11, 29, 1.4638353616301887),
(30, 'U', -1, 29, 1.661969150148885),
(31, 'W_spurred', -1, 29, 1.0199935936286502),
(32, 'R_endsWithED', -1, 29, 5.781210636435915),
(33, 'E.', 29, 5, 2.610516071895145),
(34, 'W_by', -1, 5, 4.016756180248158),
(35, 'W_deregulation', -1, 11, 3.107044113256852),
(36, 'E.', 11, 42, 3.0918965630757156),
(37, 'E.', 42, 30, 1.8076032969941829),
(38, 'W_are', -1, 30, 3.4834944891071182),
(39, 'E.', 30, 28, 3.0246636670581872),
(40, 'W_bottoming', -1, 28, 0.9246692126583386),
(41, 'W_out', -1, 5, 4.166232235849933),
(42, 'E.', 5, 42, 1.4287166848500423),
(43, 'W_turning', -1, 28, 0.9453908838237025),
(44, 'E.', 28, 19, 1.9755638903302428),
(45, 'U', -1, 19, 2.276171985449911),
(46, 'W_upward', -1, 19, 3.0573963140454667),
(47, 'E.', 19, 0, 1.7604125260111),
(48, 'W_and', -1, 0, 6.74422233788292),
(49, 'E.', 0, 28, 1.3137638161586382),
(50, 'W_threatening', -1, 28, 1.4170133688772073),
(51, 'E.', 28, 24, 1.6388299340148602),
(52, 'W_to', -1, 24, 5.749953289380646),
(53, 'E.', 24, 26, 6.296991117616081),
(54, 'U', -1, 26, 1.5167524281785714),
(55, 'W_fuel', -1, 26, 0.8669685656391124),
(56, 'E.', 26, 11, 2.7884267413237405),
(57, 'W_inflation', -1, 11, 4.497973086341019),
(58, 'E.', 11, 43, 2.1195755732689774),
(59, 'End.', -1, 43, 4.0877966130773755),
(60, 'W_.', -1, 43, 4.0877966130773755),
(61, 'S.', -1, 13, 1.3657688665015333),
(62, 'U', -1, 13, 2.0185522749436866),
(63, 'W_trucking', -1, 13, 3.1701428979681845),
(64, 'R_endsWithIng', -1, 13, 2.1776484880257367),
(65, 'E.', 13, 42, 1.497373571911848),
(66, 'W_shipping', -1, 28, 1.4634895303741247),
(67, 'E.', 28, 0, 1.3751429994639364),
(68, 'E.', 0, 11, 2.448895430660733),
(69, 'W_air-freight', -1, 11, 3.276197718236269),
(70, 'R_isDashSeparatedWords', -1, 11, 1.390362987986471),
(71, 'U', -1, 12, -0.039795570326036186),
(72, 'W_companies', -1, 12, 1.410903273102038),
(73, 'E.', 12, 30, 2.973781128828628),
(74, 'U', -1, 30, 1.3700945582140844),
(75, 'W_have', -1, 30, 3.0882453767746747),
(76, 'E.', 30, 29, 1.9098632637338704),
(77, 'W_announced', -1, 29, 1.4380819186872527),
(78, 'E.', 29, 11, 0.9060674369789037),
(79, 'W_rate', -1, 11, 5.064883717975049),
(80, 'W_increases', -1, 12, 2.319719757676896),
(81, 'E.', 42, 29, 1.299378678795197),
(82, 'W_scheduled', -1, 29, 0.9387965126997426),
(83, 'W_this', -1, 2, 4.370856586315017),
(84, 'R_endsWithS', -1, 2, 2.076343517584421),
(85, 'W_fall', -1, 11, 2.1032438732239624),
(86, 'E.', 11, 0, 3.030829781200263),
(87, 'W_or', -1, 0, 6.18424407183645),
(88, 'E.', 0, 6, 0.868722661655972),
(89, 'U', -1, 6, 2.23345149856934),
(90, 'W_early', -1, 6, 3.384977523038234),
(91, 'R_endsWithly', -1, 6, 1.5733441079056303),
(92, 'E.', 6, 6, 2.7161792323578373),
(93, 'W_next', -1, 6, 4.2480555895859),
(94, 'E.', 6, 11, 2.9916009076630092),
(95, 'W_year', -1, 11, 3.5174064665274014),
(96, 'W_reflecting', -1, 28, 0.6113402176204787),
(97, 'E.', 28, 7, 2.819730039901505),
(98, 'U', -1, 7, -0.18779864470197277),
(99, 'W_higher', -1, 7, 3.1797755805421333),
(100, 'R_endsWithER', -1, 7, 1.5728998100190603),
(101, 'E.', 7, 12, 2.318262709087868),
(102, 'W_costs', -1, 12, 2.381538004167296),
(103, 'E.', 12, 0, 0.9855356754199541),
(104, 'E.', 0, 27, 1.8595054995497817),
(105, 'U', -1, 27, 1.5446722882388022),
(106, 'W_tightened', -1, 27, 3.8726486164808493),
(107, 'R_endsWithED', -1, 27, 2.5351002298167784),
(108, 'E.', 27, 11, 2.274821834309125),
(109, 'W_demand', -1, 11, 4.064293186476195),
(110, 'E.', 11, 11, 2.2005287976885133),
(111, 'W_transport', -1, 11, 4.716356817414893),
(112, 'S.', -1, 6, 0.6581636527243294),
(113, 'W_major', -1, 6, 2.980552933449864),
(114, 'E.', 6, 12, 3.065321079522356),
(115, 'W_shippers', -1, 12, 2.542917930136408),
(116, 'W_say', -1, 30, 2.2479773095428093),
(117, 'E.', 30, 17, 2.874166869399026),
(118, 'W_they', -1, 17, 3.143575891184453),
(119, 'E.', 17, 30, 4.2749079217250445),
(120, 'W_expect', -1, 30, 3.595622140488586),
(121, 'E.', 30, 11, 1.6611422263789781),
(122, 'E.', 12, 24, 1.1213816226246236),
(123, 'W_rise', -1, 26, 1.5778724980775618),
(124, 'E.', 26, 5, 2.568809140561043),
(125, 'W_at', -1, 5, 3.9554245620023827),
(126, 'E.', 5, 8, 2.6559245091013293),
(127, 'U', -1, 8, 0.9950701292906889),
(128, 'W_least', -1, 8, 4.103392620219655),
(129, 'E.', 8, 19, 3.709110073868926),
(130, 'W_as', -1, 19, 3.1580690701037972),
(131, 'R_endsWithS', -1, 19, 1.0980845299769795),
(132, 'E.', 19, 19, 2.1155578516782),
(133, 'W_fast', -1, 19, 3.951081121254258),
(134, 'E.', 19, 5, 2.364546163338331),
(135, 'W_as', -1, 5, 4.459742427306834),
(136, 'R_endsWithS', -1, 5, 0.27528290655402593),
(137, 'E.', 0, 19, 2.002143530981041),
(138, 'W_maybe', -1, 19, 3.5081392526668402),
(139, 'E.', 19, 20, 2.2175425248487377),
(140, 'U', -1, 20, 0.5355418339873245),
(141, 'W_faster', -1, 20, 3.71210878977271),
(142, 'R_endsWithER', -1, 20, 3.472793820076381),
(143, 'E.', 20, 5, 1.3108157211407567),
(144, 'W_in', -1, 5, 3.840521231387154),
(145, 'E.', 2, 6, 2.8789088935047493),
(146, 'W_few', -1, 6, 2.4522548748428017),
(147, 'W_years', -1, 12, 1.7038384234850026),
(148, 'E.', 12, 43, 1.3730441332544099),
(149, 'S.', -1, 2, 1.926568025628669),
(150, 'U', -1, 2, 0.18622978543338722),
(151, 'W_that', -1, 2, 3.7701267071548883),
(152, 'E.', 2, 31, 3.0418354933422265),
(153, 'U', -1, 31, 1.1440561095338802),
(154, 'W_''s', -1, 31, 2.7291745322476184),
(155, 'R_endsWithS', -1, 31, 2.6658498367189947),
(156, 'E.', 31, 2, 1.956185562422949),
(157, 'W_a', -1, 2, 4.567599380671607),
(158, 'W_big', -1, 6, 2.9735195102454743),
(159, 'W_change', -1, 11, 3.3193727375317574),
(160, 'W_from', -1, 5, 3.8878088109703994),
(161, 'E.', 5, 6, 2.454738423661261),
(162, 'W_recent', -1, 6, 2.813702206270077),
(163, 'E.', 12, 35, 2.327670018390557),
(164, 'U', -1, 35, 1.4528234424876696),
(165, 'W_when', -1, 35, 4.011265086980163),
(166, 'E.', 35, 11, 2.857525452775004),
(167, 'W_haulage', -1, 11, 4.32506396474813),
(168, 'E.', 11, 27, 1.9580138925595794),
(169, 'W_was', -1, 27, 4.2660057629684),
(170, 'R_endsWithS', -1, 27, 1.7934223574225978),
(171, 'E.', 27, 2, 2.2891724016343495),
(172, 'W_bright', -1, 6, 2.992681149090579),
(173, 'W_spot', -1, 11, 3.220613491106308),
(174, 'E.', 5, 13, 2.6412138657174293),
(175, 'W_u.s.', -1, 13, 3.2079965049318577),
(176, 'R_endsWithDot', -1, 13, 3.2079965049318577),
(177, 'E.', 13, 11, 2.356503203677101),
(178, 'W_productivity', -1, 11, 3.822919829706685),
(179, 'W_helping', -1, 28, 0.694693493722968),
(180, 'W_restrain', -1, 26, 0.8600913604077386),
(181, 'E.', 0, 26, 2.411953958040776),
(182, 'W_make', -1, 26, 3.898590273777646),
(183, 'E.', 26, 13, 2.1195180614626326),
(184, 'W_industry', -1, 11, 3.6043926013837373),
(185, 'E.', 11, 20, 2.609806711358176),
(186, 'W_more', -1, 20, 3.552659383833094),
(187, 'E.', 20, 6, 3.1775432403400985),
(188, 'W_competitive', -1, 6, 2.976622053158026),
(189, 'E.', 6, 19, 1.771180632746488),
(190, 'W_abroad', -1, 19, 3.795196762237916),
(191, 'E.', 19, 43, 2.295613975954753),
(192, 'S.', -1, 39, 2.186455757483502);
-- Feature row for the token made of two backticks, kept in its own
-- single-row INSERT.
-- NOTE(review): the <! ... !> wrapper looks like preprocessor (m4-style)
-- quoting around the backticks rather than SQL syntax -- confirm against the
-- build before changing it.
INSERT INTO crf_feature_test_new VALUES
(193, <!'W_``'!>, -1, 39, 5.004869688499841);
INSERT INTO crf_feature_test_new VALUES
(194, 'E.', 39, 11, 2.5755893774727556),
(195, 'E.', 11, 31, 1.7721648420817446),
(196, 'W_has', -1, 31, 3.93625538731919),
(197, 'E.', 31, 29, 2.6386288731321472),
(198, 'W_caught', -1, 29, 3.129300187453606),
(199, 'W_up', -1, 5, 4.151560395841534),
(200, 'W_with', -1, 5, 3.675511355861305),
(201, 'W_supply', -1, 11, 2.715998611959373),
(202, 'R_endsWithly', -1, 11, 0.7673422652186473),
(203, 'W_certain', -1, 6, 2.851915076211084),
(204, 'W_types', -1, 12, 1.6258788404774098),
(205, 'E.', 12, 5, 2.6905432557328264),
(206, 'W_transportation', -1, 11, 3.191264085926264),
(207, 'E.', 42, 0, 1.2711473691909434),
(208, 'E.', 0, 12, 1.182207619933404),
(209, 'W_starting', -1, 28, 0.5515350033996826),
(210, 'W_move', -1, 26, 0.8120644141223547),
(211, 'E.', 5, 38, 2.7182674903817587),
(212, 'W_''''', -1, 38, 5.170862953158661),
(213, 'E.', 38, 5, 2.317192606205641),
(214, 'E.', 11, 39, 1.7538803171442363),
(215, 'E.', 39, 19, 3.0627961041121843),
(216, 'W_close', -1, 19, 2.495992180876776),
(217, 'E.', 19, 24, 1.5800294126144967),
(218, 'E.', 24, 0, 2.241039628902612),
(219, 'W_slightly', -1, 19, 1.3492186869598903),
(220, 'R_endsWithly', -1, 19, 2.4413654689445927),
(221, 'E.', 19, 7, 2.9386643499826097),
(222, 'W_more', -1, 7, 4.452742873438579),
(223, 'E.', 7, 5, 2.9936854236944725),
(224, 'W_than', -1, 5, 2.1644820524819646),
(225, 'E.', 42, 38, 1.4996394861187865),
(226, 'E.', 38, 27, 2.7463692256732095),
(227, 'W_said', -1, 27, 4.677340288404336),
(228, 'E.', 27, 13, 1.9336746850852031),
(229, 'W_clifford', -1, 13, 2.0747538313323317),
(230, 'E.', 13, 13, 3.9294546550138674),
(231, 'W_sayre', -1, 13, 3.3385592403455577),
(232, 'E.', 42, 11, 2.2914590080022816),
(233, 'W_director', -1, 11, 3.4531076078614316),
(234, 'E.', 5, 12, 1.054540995246233),
(235, 'W_logistics', -1, 12, 2.9422159343971326),
(236, 'W_du', -1, 13, 2.31602144186341),
(237, 'W_pont', -1, 13, 1.4463820426038703),
(238, 'W_co', -1, 13, 2.5456207401359405),
(239, 'E.', 13, 43, 1.7816633932596668),
(240, 'S.', -1, 12, 1.0387123582845164),
(241, 'E.', 12, 29, 1.1881628310493961),
(242, 'W_surveyed', -1, 29, 0.892458430315305),
(243, 'E.', 29, 19, 1.532676298227432),
(244, 'W_recently', -1, 19, 2.2798691676335316),
(245, 'W_ohio', -1, 13, 2.344559598905777),
(246, 'W_state', -1, 13, 1.3829936762017245),
(247, 'W_university', -1, 13, 2.237132870920883),
(248, 'E.', 13, 27, 1.9577360535758594),
(249, 'E.', 27, 17, 2.2087856989728887),
(250, 'E.', 30, 18, 2.079584829527366),
(251, 'W_their', -1, 18, 4.067554496475091),
(252, 'E.', 18, 6, 3.6290543548238685),
(253, 'W_freight-transport', -1, 6, 3.2568844715837297),
(254, 'R_isDashSeparatedWords', -1, 6, 0.7948432491344457),
(255, 'E.', 6, 42, 1.2615186281064843),
(256, 'W_storage', -1, 11, 2.632730740932247),
(257, 'W_distribution', -1, 11, 3.9157525580557246),
(258, 'W_about', -1, 5, 3.065121356369797),
(259, 'E.', 5, 1, 2.5380914123626104),
(260, 'W_DIGIT', -1, 1, 3.716164825033898),
(261, 'R_isAllCapital', -1, 1, 3.716164825033898),
(262, 'E.', 1, 11, 2.691091194255502),
(263, 'W_%', -1, 11, 4.045579535978065),
(264, 'E.', 11, 2, 1.5517451432551261),
(265, 'S.', -1, 19, 1.0762484384385997),
(266, 'W_only', -1, 19, 2.268536436241348),
(267, 'E.', 19, 1, 1.4108190607695246),
(268, 'E.', 2, 1, 1.6621246460306802),
(269, 'E.', 1, 12, 1.41212953803894),
(270, 'W_polled', -1, 29, 1.647493520976822),
(271, 'E.', 29, 29, 0.7767009450630513),
(272, 'W_expected', -1, 29, 0.665862099618371),
(273, 'E.', 29, 18, 1.8495571863795943),
(274, 'W_decrease', -1, 26, 1.2921324719942597),
(275, 'E.', 26, 42, 1.0963707625581782),
(276, 'W_compared', -1, 29, 0.9464663786685238),
(277, 'E.', 11, 33, 1.9870890250594957),
(278, 'U', -1, 33, 0.9707407033846202),
(279, 'W_who', -1, 33, 4.159314366798417),
(280, 'E.', 33, 27, 3.0290938616080796),
(281, 'W_had', -1, 27, 3.210438877671987),
(282, 'E.', 27, 29, 1.498126968305331),
(283, 'W_looked', -1, 29, 0.8477036764861315),
(284, 'E.', 29, 24, 1.058282178536846),
(285, 'W_freight', -1, 26, 2.6547922325266726),
(286, 'E.', 11, 24, 0.7587012089044496),
(287, 'W_reduce', -1, 26, 1.9870512796567945),
(288, 'E.', 26, 12, 0.6748848167259296),
(289, 'W_past', -1, 6, 2.852378831268221);
-- Refresh planner statistics now that the feature rows are loaded.
analyze crf_feature_test_new;
-- Dictionary table of test data: (token, total) pairs.
CREATE TABLE crf_dictionary_new (
    token text,
    total integer
);
-- NOTE(review): the <! ... !> wrapper looks like preprocessor (m4-style)
-- quoting around the backticks -- confirm against the build.
INSERT INTO crf_dictionary_new (token, total) VALUES
(<!'``'!>, 2);
-- NOTE(review): a second "INSERT INTO crf_dictionary_new VALUES" header stood
-- here with no rows after it, so it swallowed the ANALYZE below and the script
-- failed to parse. The empty header is dropped; if the rows that belonged to
-- it still exist upstream, restore them here -- TODO confirm.
analyze crf_dictionary_new;
-- Regex table: each pattern is paired with a name; the 'R_<name>' features in
-- crf_feature_test_new (R_endsWithS, R_endsWithIng, ...) use these names.
CREATE TABLE crf_regex_new (
    pattern text,
    name    text
);
-- The list used to end with a dangling comma and no terminator, so the
-- statement ran into the ANALYZE below; it is now closed after the last row.
-- NOTE(review): the dangling comma suggests a final row was lost. The MADlib
-- CRF documentation's sample regex table ends with ('^.*@.*$','isEmailId') --
-- confirm whether that row belongs here.
INSERT INTO crf_regex_new (pattern, name) VALUES
('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'),
('^.+[.]$','endsWithDot'), ('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'), ('^.+s$','endsWithS'),
('^.+ing$','endsWithIng'), ('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords');
analyze crf_regex_new;
-- Labels table: maps the label ids used by crf_feature_test_new
-- (label_id / prev_label_id) to the label text compared in the final check.
CREATE TABLE crf_label_new (
    id    integer,
    label character varying
);
INSERT INTO crf_label_new (id, label) VALUES
(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,'''''');
-- Label 39 is the two-backtick label. Its row was missing (only an empty
-- INSERT header remained), yet the feature table references label id 39 and
-- the expected output contains this label, so the row is restored here.
-- NOTE(review): the <! ... !> wrapper mirrors how this literal is written
-- elsewhere in the file (apparently preprocessor quoting) -- confirm.
INSERT INTO crf_label_new (id, label) VALUES
(39,<!'``'!>);
INSERT INTO crf_label_new (id, label) VALUES
(40,'('), (41,')'), (42,','), (43,'.'), (44,':');
analyze crf_label_new;
-- Segment table of test data: one row per token. start_pos is the token's
-- position within document doc_id, and max_pos repeats the last position of
-- that document on each of its rows (30, 40, 16 and 17 for documents 1-4).
CREATE TABLE test_segmenttbl_new (
    start_pos integer,
    doc_id    integer,
    seg_text  text,
    max_pos   integer
);
-- Column lists are spelled out so the rows do not depend on declaration order.
INSERT INTO test_segmenttbl_new (start_pos, doc_id, seg_text, max_pos) VALUES
(0, 1, 'that', 30),
(1, 1, '''s', 30),
(2, 1, 'a', 30),
(3, 1, 'big', 30),
(4, 1, 'change', 30),
(5, 1, 'from', 30),
(6, 1, 'recent', 30),
(7, 1, 'years', 30),
(8, 1, 'when', 30),
(9, 1, 'freight', 30),
(10, 1, 'haulage', 30),
(11, 1, 'was', 30),
(12, 1, 'a', 30),
(13, 1, 'bright', 30),
(14, 1, 'spot', 30),
(15, 1, 'for', 30),
(16, 1, 'u.s.', 30),
(17, 1, 'productivity', 30),
(18, 1, ',', 30),
(19, 1, 'helping', 30),
(20, 1, 'to', 30),
(21, 1, 'restrain', 30),
(22, 1, 'inflation', 30),
(23, 1, 'and', 30),
(24, 1, 'make', 30),
(25, 1, 'u.s.', 30),
(26, 1, 'industry', 30),
(27, 1, 'more', 30),
(28, 1, 'competitive', 30),
(29, 1, 'abroad', 30),
(30, 1, '.', 30);
-- First token of document 2 is a pair of backticks, inserted on its own.
-- NOTE(review): the <! ... !> wrapper looks like preprocessor (m4-style)
-- quoting around the backticks -- confirm against the build.
INSERT INTO test_segmenttbl_new (start_pos, doc_id, seg_text, max_pos) VALUES
(0, 2, <!'``'!>, 40);
INSERT INTO test_segmenttbl_new (start_pos, doc_id, seg_text, max_pos) VALUES
(1, 2, 'this', 40),
(2, 2, 'is', 40),
(3, 2, 'the', 40),
(4, 2, 'first', 40),
(5, 2, 'year', 40),
(6, 2, 'since', 40),
(7, 2, 'transportation', 40),
(8, 2, 'deregulation', 40),
(9, 2, 'in', 40),
(10, 2, '1980', 40),
(11, 2, 'that', 40),
(12, 2, 'we', 40),
(13, 2, 'have', 40),
(14, 2, 'had', 40),
(15, 2, 'such', 40),
(16, 2, 'a', 40),
(17, 2, 'dramatic', 40),
(18, 2, 'and', 40),
(19, 2, 'broad-based', 40),
(20, 2, 'upturn', 40),
(21, 2, 'in', 40),
(22, 2, 'perceived', 40),
(23, 2, 'transportation', 40),
(24, 2, 'rates', 40),
(25, 2, ',', 40),
(26, 2, '''''', 40),
(27, 2, 'said', 40),
(28, 2, 'bernard', 40),
(29, 2, 'lalonde', 40),
(30, 2, ',', 40),
(31, 2, 'a', 40),
(32, 2, 'transportation', 40),
(33, 2, 'logistics', 40),
(34, 2, 'professor', 40),
(35, 2, 'at', 40),
(36, 2, 'ohio', 40),
(37, 2, 'state', 40),
(38, 2, 'in', 40),
(39, 2, 'columbus', 40),
(40, 2, '.', 40),
(0, 3, 'carriers', 16),
(1, 3, 'could', 16),
(2, 3, 'use', 16),
(3, 3, 'their', 16),
(4, 3, 'equipment', 16),
(5, 3, 'more', 16),
(6, 3, 'efficiently', 16),
(7, 3, ',', 16),
(8, 3, 'leading', 16),
(9, 3, 'to', 16),
(10, 3, 'overcapacity', 16),
(11, 3, 'they', 16),
(12, 3, 'were', 16),
(13, 3, 'eager', 16),
(14, 3, 'to', 16),
(15, 3, 'fill', 16),
(16, 3, '.', 16),
(0, 4, 'the', 17),
(1, 4, 'deregulation', 17),
(2, 4, 'of', 17),
(3, 4, 'railroads', 17),
(4, 4, 'and', 17),
(5, 4, 'trucking', 17),
(6, 4, 'companies', 17),
(7, 4, 'that', 17),
(8, 4, 'began', 17),
(9, 4, 'in', 17),
(10, 4, '1980', 17),
(11, 4, 'enabled', 17),
(12, 4, 'shippers', 17),
(13, 4, 'to', 17),
(14, 4, 'bargain', 17),
(15, 4, 'for', 17),
(16, 4, 'transportation', 17),
(17, 4, '.', 17);
analyze test_segmenttbl_new;
-- extract features for tokens stored in segmenttbl
-- Arguments: segment, dictionary, label, regex and feature tables defined
-- above, followed by the two viterbi_* table names passed on to vcrf_label.
SELECT crf_test_fgen('test_segmenttbl_new','crf_dictionary_new','crf_label_new','crf_regex_new','crf_feature_test_new','viterbi_mtbl_new','viterbi_rtbl_new');
-- Label the test segments. The call was cut off after the opening
-- parenthesis; it is completed with the segment table, the two viterbi_*
-- tables named in the crf_test_fgen call, the label table, and
-- 'extraction_new', the table the final comparison reads.
-- NOTE(review): argument order follows the documented MADlib signature
-- vcrf_label(segment_tbl, viterbi_mtbl, viterbi_rtbl, label_tbl, result_tbl)
-- -- confirm against the installed version.
SELECT vcrf_label('test_segmenttbl_new','viterbi_mtbl_new','viterbi_rtbl_new','crf_label_new','extraction_new');
-- Expected viterbi labeling result
-- The result is produced from Dr. Sunita's CRF java package with the same input
-- One row per (doc_id, start_pos) token of test_segmenttbl_new, with the
-- label text the final comparison expects for it.
CREATE TABLE expected_extraction_new (
    doc_id    integer,
    start_pos integer,
    seg_text  text,
    label     character varying
);
-- Column lists are spelled out so the rows do not depend on declaration order.
INSERT INTO expected_extraction_new (doc_id, start_pos, seg_text, label) VALUES
(1, 0, 'that', 'DT'),
(1, 1, '''s', 'VBZ'),
(1, 2, 'a', 'DT'),
(1, 3, 'big', 'JJ'),
(1, 4, 'change', 'NN'),
(1, 5, 'from', 'IN'),
(1, 6, 'recent', 'JJ'),
(1, 7, 'years', 'NNS'),
(1, 8, 'when', 'WRB'),
(1, 9, 'freight', 'NN'),
(1, 10, 'haulage', 'NN'),
(1, 11, 'was', 'VBD'),
(1, 12, 'a', 'DT'),
(1, 13, 'bright', 'JJ'),
(1, 14, 'spot', 'NN'),
(1, 15, 'for', 'IN'),
(1, 16, 'u.s.', 'NNP'),
(1, 17, 'productivity', 'NN'),
(1, 18, ',', ','),
(1, 19, 'helping', 'VBG'),
(1, 20, 'to', 'TO'),
(1, 21, 'restrain', 'VB'),
(1, 22, 'inflation', 'NN'),
(1, 23, 'and', 'CC'),
(1, 24, 'make', 'VB'),
(1, 25, 'u.s.', 'NNP'),
(1, 26, 'industry', 'NN'),
(1, 27, 'more', 'RBR'),
(1, 28, 'competitive', 'JJ'),
(1, 29, 'abroad', 'RB'),
(1, 30, '.', '.');
-- First token of document 2: both the token and its label are a pair of
-- backticks, inserted on their own.
-- NOTE(review): the <! ... !> wrapper looks like preprocessor (m4-style)
-- quoting around the backticks -- confirm against the build.
INSERT INTO expected_extraction_new (doc_id, start_pos, seg_text, label) VALUES
(2, 0, <!'``'!>, <!'``'!>);
INSERT INTO expected_extraction_new (doc_id, start_pos, seg_text, label) VALUES
(2, 1, 'this', 'DT'),
(2, 2, 'is', 'VBZ'),
(2, 3, 'the', 'DT'),
(2, 4, 'first', 'JJ'),
(2, 5, 'year', 'NN'),
(2, 6, 'since', 'IN'),
(2, 7, 'transportation', 'NN'),
(2, 8, 'deregulation', 'NN'),
(2, 9, 'in', 'IN'),
(2, 10, '1980', 'CD'),
(2, 11, 'that', 'DT'),
(2, 12, 'we', 'JJ'),
(2, 13, 'have', 'JJ'),
(2, 14, 'had', 'NN'),
(2, 15, 'such', 'IN'),
(2, 16, 'a', 'DT'),
(2, 17, 'dramatic', 'NN'),
(2, 18, 'and', 'CC'),
(2, 19, 'broad-based', 'VBN'),
(2, 20, 'upturn', 'RB'),
(2, 21, 'in', 'IN'),
(2, 22, 'perceived', 'VBN'),
(2, 23, 'transportation', 'NN'),
(2, 24, 'rates', 'NNS'),
(2, 25, ',', ','),
(2, 26, '''''', ''''''),
(2, 27, 'said', 'VBD'),
(2, 28, 'bernard', 'NNP'),
(2, 29, 'lalonde', 'NNP'),
(2, 30, ',', ','),
(2, 31, 'a', 'DT'),
(2, 32, 'transportation', 'NN'),
(2, 33, 'logistics', 'NNS'),
(2, 34, 'professor', 'IN'),
(2, 35, 'at', 'IN'),
(2, 36, 'ohio', 'NNP'),
(2, 37, 'state', 'NNP'),
(2, 38, 'in', 'NNP'),
(2, 39, 'columbus', 'NNP'),
(2, 40, '.', '.'),
(3, 0, 'carriers', 'NNS'),
(3, 1, 'could', 'VBP'),
(3, 2, 'use', 'VBN'),
(3, 3, 'their', 'PRP$'),
(3, 4, 'equipment', 'JJ'),
(3, 5, 'more', 'RBR'),
(3, 6, 'efficiently', 'JJ'),
(3, 7, ',', ','),
(3, 8, 'leading', 'VBG'),
(3, 9, 'to', 'TO'),
(3, 10, 'overcapacity', 'VB'),
(3, 11, 'they', 'PRP'),
(3, 12, 'were', 'VBP'),
(3, 13, 'eager', 'VBG'),
(3, 14, 'to', 'TO'),
(3, 15, 'fill', 'VB'),
(3, 16, '.', '.'),
(4, 0, 'the', 'DT'),
(4, 1, 'deregulation', 'NN'),
(4, 2, 'of', 'IN'),
(4, 3, 'railroads', 'NN'),
(4, 4, 'and', 'CC'),
(4, 5, 'trucking', 'NNP'),
(4, 6, 'companies', 'VBD'),
(4, 7, 'that', 'DT'),
(4, 8, 'began', 'NN'),
(4, 9, 'in', 'IN'),
(4, 10, '1980', 'CD'),
(4, 11, 'enabled', 'VBN'),
(4, 12, 'shippers', 'NNS'),
(4, 13, 'to', 'TO'),
(4, 14, 'bargain', 'VB'),
(4, 15, 'for', 'IN'),
(4, 16, 'transportation', 'NN'),
(4, 17, '.', '.');
-- Compare the expected result and the viterbi extraction result. It succeeds
-- only if the two tables are the same.
-- s1 counts rows that are expected but not produced; s2 counts rows that are
-- produced but not expected. Both must be zero. EXCEPT ALL keeps duplicates,
-- so a row that appears a different number of times on each side is also
-- reported.
-- NOTE(review): the FROM clause, the set operators and the s1/s2 wrappers
-- were missing and have been rebuilt from the s1.count / s2.count references
-- and the comment above -- confirm EXCEPT ALL (vs. plain EXCEPT) is intended.
SELECT assert(
    s1.count + s2.count = 0,
    'Labels predicted do not match expected labels.'
)
FROM
(
    SELECT count(*) FROM (
        SELECT doc_id, start_pos, seg_text, label
        FROM expected_extraction_new
        EXCEPT ALL
        SELECT doc_id, start_pos, seg_text, label
        FROM extraction_new
    ) AS U
) AS s1
CROSS JOIN
(
    SELECT count(*) FROM (
        SELECT doc_id, start_pos, seg_text, label
        FROM extraction_new
        EXCEPT ALL
        SELECT doc_id, start_pos, seg_text, label
        FROM expected_extraction_new
    ) AS U
) AS s2;