-- blob: 66585457baeb52a1236ac1ee1f445382cdca9e4f [file] [log] [blame]
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------
-- Feature weights produced by Dr. Sunita's CRF java package (the "CRF2" package
-- on github, as provided by Kun) from the training data in us50.train.tagged.
-- One row per feature:
--   id             feature identifier
--   name           feature name; the rows loaded below use W_ plus a token,
--                  R_ plus a regex name, and the fixed names E., S., End. and U
--   prev_label_id  previous label id on E. rows, -1 on every other row
--   label_id       label the feature applies to (presumably an id from
--                  crf_label_new -- TODO confirm)
--   weight         weight attached to the feature
CREATE TABLE crf_feature_test_new (
    id            integer,
    name          text,
    prev_label_id integer,
    label_id      integer,
    weight        float
);
-- Features 0..192. Values are positional, in table column order:
-- (id, name, prev_label_id, label_id, weight).
INSERT INTO crf_feature_test_new VALUES
(0, 'S.', -1, 11, 0.5516753522178934),
(1, 'W_freight', -1, 11, 5.959241076198326),
(2, 'E.', 11, 12, 2.0789747316372034),
(3, 'W_rates', -1, 12, 2.0837653985907174),
(4, 'R_endsWithS', -1, 12, 5.306222451221396),
(5, 'E.', 12, 42, 1.8107793215017256),
(6, 'W_,', -1, 42, 7.286509411020296),
(7, 'E.', 42, 28, 2.222715331802022),
(8, 'U', -1, 28, 0.9492501230997201),
(9, 'W_declining', -1, 28, 0.9644339685939746),
(10, 'R_endsWithIng', -1, 28, 5.3575087838268995),
(11, 'E.', 28, 5, 2.459070565360766),
(12, 'W_for', -1, 5, 4.538367636569374),
(13, 'E.', 5, 21, 2.633416169216998),
(14, 'U', -1, 21, 1.2724051325350452),
(15, 'W_most', -1, 21, 4.742804679577188),
(16, 'E.', 21, 5, 2.3563567868687803),
(17, 'W_of', -1, 5, 5.886369615822257),
(18, 'E.', 5, 2, 3.4072332563788974),
(19, 'W_the', -1, 2, 5.003139924048258),
(20, 'E.', 2, 11, 3.4951824947429704),
(21, 'U', -1, 11, 1.3530610141204105),
(22, 'W_decade', -1, 11, 2.5363441670581475),
(23, 'E.', 11, 5, 3.168556891460435),
(24, 'U', -1, 5, 0.8846547539288385),
(25, 'W_because', -1, 5, 4.371973715457267),
(26, 'E.', 5, 5, 2.3178076749274994),
(27, 'E.', 5, 11, 3.008379674551115),
(28, 'W_competition', -1, 11, 3.7657358279072044),
(29, 'E.', 11, 29, 1.4638353616301887),
(30, 'U', -1, 29, 1.661969150148885),
(31, 'W_spurred', -1, 29, 1.0199935936286502),
(32, 'R_endsWithED', -1, 29, 5.781210636435915),
(33, 'E.', 29, 5, 2.610516071895145),
(34, 'W_by', -1, 5, 4.016756180248158),
(35, 'W_deregulation', -1, 11, 3.107044113256852),
(36, 'E.', 11, 42, 3.0918965630757156),
(37, 'E.', 42, 30, 1.8076032969941829),
(38, 'W_are', -1, 30, 3.4834944891071182),
(39, 'E.', 30, 28, 3.0246636670581872),
(40, 'W_bottoming', -1, 28, 0.9246692126583386),
(41, 'W_out', -1, 5, 4.166232235849933),
(42, 'E.', 5, 42, 1.4287166848500423),
(43, 'W_turning', -1, 28, 0.9453908838237025),
(44, 'E.', 28, 19, 1.9755638903302428),
(45, 'U', -1, 19, 2.276171985449911),
(46, 'W_upward', -1, 19, 3.0573963140454667),
(47, 'E.', 19, 0, 1.7604125260111),
(48, 'W_and', -1, 0, 6.74422233788292),
(49, 'E.', 0, 28, 1.3137638161586382),
(50, 'W_threatening', -1, 28, 1.4170133688772073),
(51, 'E.', 28, 24, 1.6388299340148602),
(52, 'W_to', -1, 24, 5.749953289380646),
(53, 'E.', 24, 26, 6.296991117616081),
(54, 'U', -1, 26, 1.5167524281785714),
(55, 'W_fuel', -1, 26, 0.8669685656391124),
(56, 'E.', 26, 11, 2.7884267413237405),
(57, 'W_inflation', -1, 11, 4.497973086341019),
(58, 'E.', 11, 43, 2.1195755732689774),
(59, 'End.', -1, 43, 4.0877966130773755),
(60, 'W_.', -1, 43, 4.0877966130773755),
(61, 'S.', -1, 13, 1.3657688665015333),
(62, 'U', -1, 13, 2.0185522749436866),
(63, 'W_trucking', -1, 13, 3.1701428979681845),
(64, 'R_endsWithIng', -1, 13, 2.1776484880257367),
(65, 'E.', 13, 42, 1.497373571911848),
(66, 'W_shipping', -1, 28, 1.4634895303741247),
(67, 'E.', 28, 0, 1.3751429994639364),
(68, 'E.', 0, 11, 2.448895430660733),
(69, 'W_air-freight', -1, 11, 3.276197718236269),
(70, 'R_isDashSeparatedWords', -1, 11, 1.390362987986471),
(71, 'U', -1, 12, -0.039795570326036186),
(72, 'W_companies', -1, 12, 1.410903273102038),
(73, 'E.', 12, 30, 2.973781128828628),
(74, 'U', -1, 30, 1.3700945582140844),
(75, 'W_have', -1, 30, 3.0882453767746747),
(76, 'E.', 30, 29, 1.9098632637338704),
(77, 'W_announced', -1, 29, 1.4380819186872527),
(78, 'E.', 29, 11, 0.9060674369789037),
(79, 'W_rate', -1, 11, 5.064883717975049),
(80, 'W_increases', -1, 12, 2.319719757676896),
(81, 'E.', 42, 29, 1.299378678795197),
(82, 'W_scheduled', -1, 29, 0.9387965126997426),
(83, 'W_this', -1, 2, 4.370856586315017),
(84, 'R_endsWithS', -1, 2, 2.076343517584421),
(85, 'W_fall', -1, 11, 2.1032438732239624),
(86, 'E.', 11, 0, 3.030829781200263),
(87, 'W_or', -1, 0, 6.18424407183645),
(88, 'E.', 0, 6, 0.868722661655972),
(89, 'U', -1, 6, 2.23345149856934),
(90, 'W_early', -1, 6, 3.384977523038234),
(91, 'R_endsWithly', -1, 6, 1.5733441079056303),
(92, 'E.', 6, 6, 2.7161792323578373),
(93, 'W_next', -1, 6, 4.2480555895859),
(94, 'E.', 6, 11, 2.9916009076630092),
(95, 'W_year', -1, 11, 3.5174064665274014),
(96, 'W_reflecting', -1, 28, 0.6113402176204787),
(97, 'E.', 28, 7, 2.819730039901505),
(98, 'U', -1, 7, -0.18779864470197277),
(99, 'W_higher', -1, 7, 3.1797755805421333),
(100, 'R_endsWithER', -1, 7, 1.5728998100190603),
(101, 'E.', 7, 12, 2.318262709087868),
(102, 'W_costs', -1, 12, 2.381538004167296),
(103, 'E.', 12, 0, 0.9855356754199541),
(104, 'E.', 0, 27, 1.8595054995497817),
(105, 'U', -1, 27, 1.5446722882388022),
(106, 'W_tightened', -1, 27, 3.8726486164808493),
(107, 'R_endsWithED', -1, 27, 2.5351002298167784),
(108, 'E.', 27, 11, 2.274821834309125),
(109, 'W_demand', -1, 11, 4.064293186476195),
(110, 'E.', 11, 11, 2.2005287976885133),
(111, 'W_transport', -1, 11, 4.716356817414893),
(112, 'S.', -1, 6, 0.6581636527243294),
(113, 'W_major', -1, 6, 2.980552933449864),
(114, 'E.', 6, 12, 3.065321079522356),
(115, 'W_shippers', -1, 12, 2.542917930136408),
(116, 'W_say', -1, 30, 2.2479773095428093),
(117, 'E.', 30, 17, 2.874166869399026),
(118, 'W_they', -1, 17, 3.143575891184453),
(119, 'E.', 17, 30, 4.2749079217250445),
(120, 'W_expect', -1, 30, 3.595622140488586),
(121, 'E.', 30, 11, 1.6611422263789781),
(122, 'E.', 12, 24, 1.1213816226246236),
(123, 'W_rise', -1, 26, 1.5778724980775618),
(124, 'E.', 26, 5, 2.568809140561043),
(125, 'W_at', -1, 5, 3.9554245620023827),
(126, 'E.', 5, 8, 2.6559245091013293),
(127, 'U', -1, 8, 0.9950701292906889),
(128, 'W_least', -1, 8, 4.103392620219655),
(129, 'E.', 8, 19, 3.709110073868926),
(130, 'W_as', -1, 19, 3.1580690701037972),
(131, 'R_endsWithS', -1, 19, 1.0980845299769795),
(132, 'E.', 19, 19, 2.1155578516782),
(133, 'W_fast', -1, 19, 3.951081121254258),
(134, 'E.', 19, 5, 2.364546163338331),
(135, 'W_as', -1, 5, 4.459742427306834),
(136, 'R_endsWithS', -1, 5, 0.27528290655402593),
(137, 'E.', 0, 19, 2.002143530981041),
(138, 'W_maybe', -1, 19, 3.5081392526668402),
(139, 'E.', 19, 20, 2.2175425248487377),
(140, 'U', -1, 20, 0.5355418339873245),
(141, 'W_faster', -1, 20, 3.71210878977271),
(142, 'R_endsWithER', -1, 20, 3.472793820076381),
(143, 'E.', 20, 5, 1.3108157211407567),
(144, 'W_in', -1, 5, 3.840521231387154),
(145, 'E.', 2, 6, 2.8789088935047493),
(146, 'W_few', -1, 6, 2.4522548748428017),
(147, 'W_years', -1, 12, 1.7038384234850026),
(148, 'E.', 12, 43, 1.3730441332544099),
(149, 'S.', -1, 2, 1.926568025628669),
(150, 'U', -1, 2, 0.18622978543338722),
(151, 'W_that', -1, 2, 3.7701267071548883),
(152, 'E.', 2, 31, 3.0418354933422265),
(153, 'U', -1, 31, 1.1440561095338802),
(154, 'W_''s', -1, 31, 2.7291745322476184),
(155, 'R_endsWithS', -1, 31, 2.6658498367189947),
(156, 'E.', 31, 2, 1.956185562422949),
(157, 'W_a', -1, 2, 4.567599380671607),
(158, 'W_big', -1, 6, 2.9735195102454743),
(159, 'W_change', -1, 11, 3.3193727375317574),
(160, 'W_from', -1, 5, 3.8878088109703994),
(161, 'E.', 5, 6, 2.454738423661261),
(162, 'W_recent', -1, 6, 2.813702206270077),
(163, 'E.', 12, 35, 2.327670018390557),
(164, 'U', -1, 35, 1.4528234424876696),
(165, 'W_when', -1, 35, 4.011265086980163),
(166, 'E.', 35, 11, 2.857525452775004),
(167, 'W_haulage', -1, 11, 4.32506396474813),
(168, 'E.', 11, 27, 1.9580138925595794),
(169, 'W_was', -1, 27, 4.2660057629684),
(170, 'R_endsWithS', -1, 27, 1.7934223574225978),
(171, 'E.', 27, 2, 2.2891724016343495),
(172, 'W_bright', -1, 6, 2.992681149090579),
(173, 'W_spot', -1, 11, 3.220613491106308),
(174, 'E.', 5, 13, 2.6412138657174293),
(175, 'W_u.s.', -1, 13, 3.2079965049318577),
(176, 'R_endsWithDot', -1, 13, 3.2079965049318577),
(177, 'E.', 13, 11, 2.356503203677101),
(178, 'W_productivity', -1, 11, 3.822919829706685),
(179, 'W_helping', -1, 28, 0.694693493722968),
(180, 'W_restrain', -1, 26, 0.8600913604077386),
(181, 'E.', 0, 26, 2.411953958040776),
(182, 'W_make', -1, 26, 3.898590273777646),
(183, 'E.', 26, 13, 2.1195180614626326),
(184, 'W_industry', -1, 11, 3.6043926013837373),
(185, 'E.', 11, 20, 2.609806711358176),
(186, 'W_more', -1, 20, 3.552659383833094),
(187, 'E.', 20, 6, 3.1775432403400985),
(188, 'W_competitive', -1, 6, 2.976622053158026),
(189, 'E.', 6, 19, 1.771180632746488),
(190, 'W_abroad', -1, 19, 3.795196762237916),
(191, 'E.', 19, 43, 2.295613975954753),
(192, 'S.', -1, 39, 2.186455757483502);
-- Feature 193 is the W_ feature whose token is two backquote characters.
-- The literal is wrapped in alternate m4 quote delimiters, presumably so the
-- preprocessor copies the backquotes through instead of reading them as the
-- start of an m4 quoted string -- TODO confirm against the build setup.
-- The closing macro call passes empty delimiters, which GNU m4 documents as
-- disabling quoting.
m4_changequote(<!,!>)
INSERT INTO crf_feature_test_new VALUES
(193, <!'W_``'!>, -1, 39, 5.004869688499841);
m4_changequote(,)
-- Features 194..289. Values are positional, in table column order:
-- (id, name, prev_label_id, label_id, weight).
INSERT INTO crf_feature_test_new VALUES
(194, 'E.', 39, 11, 2.5755893774727556),
(195, 'E.', 11, 31, 1.7721648420817446),
(196, 'W_has', -1, 31, 3.93625538731919),
(197, 'E.', 31, 29, 2.6386288731321472),
(198, 'W_caught', -1, 29, 3.129300187453606),
(199, 'W_up', -1, 5, 4.151560395841534),
(200, 'W_with', -1, 5, 3.675511355861305),
(201, 'W_supply', -1, 11, 2.715998611959373),
(202, 'R_endsWithly', -1, 11, 0.7673422652186473),
(203, 'W_certain', -1, 6, 2.851915076211084),
(204, 'W_types', -1, 12, 1.6258788404774098),
(205, 'E.', 12, 5, 2.6905432557328264),
(206, 'W_transportation', -1, 11, 3.191264085926264),
(207, 'E.', 42, 0, 1.2711473691909434),
(208, 'E.', 0, 12, 1.182207619933404),
(209, 'W_starting', -1, 28, 0.5515350033996826),
(210, 'W_move', -1, 26, 0.8120644141223547),
(211, 'E.', 5, 38, 2.7182674903817587),
(212, 'W_''''', -1, 38, 5.170862953158661),
(213, 'E.', 38, 5, 2.317192606205641),
(214, 'E.', 11, 39, 1.7538803171442363),
(215, 'E.', 39, 19, 3.0627961041121843),
(216, 'W_close', -1, 19, 2.495992180876776),
(217, 'E.', 19, 24, 1.5800294126144967),
(218, 'E.', 24, 0, 2.241039628902612),
(219, 'W_slightly', -1, 19, 1.3492186869598903),
(220, 'R_endsWithly', -1, 19, 2.4413654689445927),
(221, 'E.', 19, 7, 2.9386643499826097),
(222, 'W_more', -1, 7, 4.452742873438579),
(223, 'E.', 7, 5, 2.9936854236944725),
(224, 'W_than', -1, 5, 2.1644820524819646),
(225, 'E.', 42, 38, 1.4996394861187865),
(226, 'E.', 38, 27, 2.7463692256732095),
(227, 'W_said', -1, 27, 4.677340288404336),
(228, 'E.', 27, 13, 1.9336746850852031),
(229, 'W_clifford', -1, 13, 2.0747538313323317),
(230, 'E.', 13, 13, 3.9294546550138674),
(231, 'W_sayre', -1, 13, 3.3385592403455577),
(232, 'E.', 42, 11, 2.2914590080022816),
(233, 'W_director', -1, 11, 3.4531076078614316),
(234, 'E.', 5, 12, 1.054540995246233),
(235, 'W_logistics', -1, 12, 2.9422159343971326),
(236, 'W_du', -1, 13, 2.31602144186341),
(237, 'W_pont', -1, 13, 1.4463820426038703),
(238, 'W_co', -1, 13, 2.5456207401359405),
(239, 'E.', 13, 43, 1.7816633932596668),
(240, 'S.', -1, 12, 1.0387123582845164),
(241, 'E.', 12, 29, 1.1881628310493961),
(242, 'W_surveyed', -1, 29, 0.892458430315305),
(243, 'E.', 29, 19, 1.532676298227432),
(244, 'W_recently', -1, 19, 2.2798691676335316),
(245, 'W_ohio', -1, 13, 2.344559598905777),
(246, 'W_state', -1, 13, 1.3829936762017245),
(247, 'W_university', -1, 13, 2.237132870920883),
(248, 'E.', 13, 27, 1.9577360535758594),
(249, 'E.', 27, 17, 2.2087856989728887),
(250, 'E.', 30, 18, 2.079584829527366),
(251, 'W_their', -1, 18, 4.067554496475091),
(252, 'E.', 18, 6, 3.6290543548238685),
(253, 'W_freight-transport', -1, 6, 3.2568844715837297),
(254, 'R_isDashSeparatedWords', -1, 6, 0.7948432491344457),
(255, 'E.', 6, 42, 1.2615186281064843),
(256, 'W_storage', -1, 11, 2.632730740932247),
(257, 'W_distribution', -1, 11, 3.9157525580557246),
(258, 'W_about', -1, 5, 3.065121356369797),
(259, 'E.', 5, 1, 2.5380914123626104),
(260, 'W_DIGIT', -1, 1, 3.716164825033898),
(261, 'R_isAllCapital', -1, 1, 3.716164825033898),
(262, 'E.', 1, 11, 2.691091194255502),
(263, 'W_%', -1, 11, 4.045579535978065),
(264, 'E.', 11, 2, 1.5517451432551261),
(265, 'S.', -1, 19, 1.0762484384385997),
(266, 'W_only', -1, 19, 2.268536436241348),
(267, 'E.', 19, 1, 1.4108190607695246),
(268, 'E.', 2, 1, 1.6621246460306802),
(269, 'E.', 1, 12, 1.41212953803894),
(270, 'W_polled', -1, 29, 1.647493520976822),
(271, 'E.', 29, 29, 0.7767009450630513),
(272, 'W_expected', -1, 29, 0.665862099618371),
(273, 'E.', 29, 18, 1.8495571863795943),
(274, 'W_decrease', -1, 26, 1.2921324719942597),
(275, 'E.', 26, 42, 1.0963707625581782),
(276, 'W_compared', -1, 29, 0.9464663786685238),
(277, 'E.', 11, 33, 1.9870890250594957),
(278, 'U', -1, 33, 0.9707407033846202),
(279, 'W_who', -1, 33, 4.159314366798417),
(280, 'E.', 33, 27, 3.0290938616080796),
(281, 'W_had', -1, 27, 3.210438877671987),
(282, 'E.', 27, 29, 1.498126968305331),
(283, 'W_looked', -1, 29, 0.8477036764861315),
(284, 'E.', 29, 24, 1.058282178536846),
(285, 'W_freight', -1, 26, 2.6547922325266726),
(286, 'E.', 11, 24, 0.7587012089044496),
(287, 'W_reduce', -1, 26, 1.9870512796567945),
(288, 'E.', 26, 12, 0.6748848167259296),
(289, 'W_past', -1, 6, 2.852378831268221);
-- Collect planner statistics now that the feature table is fully loaded.
analyze crf_feature_test_new;
-- Dictionary table of the test data: one row per token.
--   token  token text
--   total  integer stored with the token (presumably its occurrence count --
--          TODO confirm)
CREATE TABLE crf_dictionary_new (
    token text,
    total integer
);
-- Dictionary entry for the token made of two backquote characters. The
-- literal is wrapped in alternate m4 quote delimiters, presumably so the
-- preprocessor copies the backquotes through unchanged -- TODO confirm.
-- The closing macro call passes empty delimiters, which GNU m4 documents as
-- disabling quoting.
m4_changequote(<!,!>)
INSERT INTO crf_dictionary_new VALUES
(<!'``'!>, 2);
m4_changequote(,)
-- Remaining dictionary entries. Values are positional: (token, total).
-- A doubled single quote inside a literal is the SQL escape for one quote.
INSERT INTO crf_dictionary_new VALUES
(',',12),
('.',7),
('''''',2),
('%',3),
('a',3),
('about',1),
('abroad',1),
('air-freight',1),
('and',7),
('announced',1),
('are',2),
('as',2),
('at',3),
('because',1),
('big',1),
('bottoming',1),
('bright',1),
('by',2),
('caught',1),
('certain',1),
('change',1),
('clifford',1),
('close',1),
('co',1),
('companies',1),
('compared',1),
('competition',1),
('competitive',1),
('costs',4),
('decade',1),
('declining',1),
('decrease',1),
('demand',2),
('deregulation',1),
('DIGIT',4),
('director',1),
('distribution',1),
('du',1),
('early',1),
('expect',2),
('expected',1),
('fall',1),
('fast',1),
('faster',1),
('few',1),
('for',4),
('freight',6),
('freight-transport',2),
('from',1),
('fuel',1),
('had',1),
('has',1),
('haulage',1),
('have',1),
('helping',1),
('higher',1),
('in',2),
('increases',1),
('industry',1),
('inflation',4),
('least',1),
('logistics',1),
('looked',1),
('major',1),
('make',1),
('maybe',1),
('more',2),
('most',1),
('move',1),
('next',2),
('of',6),
('ohio',1),
('only',1),
('or',2),
('out',1),
('past',1),
('polled',1),
('pont',1),
('productivity',1),
('rate',3),
('rates',3),
('recent',1),
('recently',1),
('reduce',1),
('reflecting',1),
('restrain',1),
('rise',2),
('''s',1),
('said',2),
('say',1),
('sayre',1),
('scheduled',1),
('shippers',3),
('shipping',1),
('slightly',1),
('spot',1),
('spurred',1),
('starting',1),
('state',1),
('storage',1),
('supply',1),
('surveyed',1),
('than',1),
('that',1),
('the',5),
('their',2),
('they',2),
('this',2),
('threatening',1),
('tightened',1),
('to',9),
('transport',2),
('transportation',1),
('trucking',1),
('turning',1),
('types',1),
('university',1),
('up',2),
('upward',1),
('u.s.',2),
('was',1),
('when',1),
('who',1),
('with',2),
('year',2),
('years',3);
-- Collect planner statistics now that the dictionary is fully loaded.
analyze crf_dictionary_new;
-- Regex table: each regular expression is paired with a name. The same names
-- appear after the R_ prefix in crf_feature_test_new (for example endsWithS).
CREATE TABLE crf_regex_new (
    pattern text,
    name    text
);
-- Values are positional: (pattern, name).
INSERT INTO crf_regex_new VALUES
    ('^[A-Z][a-z]+$', 'InitCapital'),
    ('^[A-Z]+$', 'isAllCapital'),
    ('^.*[0-9]+.*$', 'containsDigit'),
    ('^.+[.]$', 'endsWithDot'),
    ('^.+[,]$', 'endsWithComma'),
    ('^.+er$', 'endsWithER'),
    ('^.+est$', 'endsWithEst'),
    ('^.+ed$', 'endsWithED'),
    ('^.+s$', 'endsWithS'),
    ('^.+ing$', 'endsWithIng'),
    ('^.+ly$', 'endsWithly'),
    ('^.+-.+$', 'isDashSeparatedWords'),
    ('^.*@.*$', 'isEmailId');
ANALYZE crf_regex_new;
-- Labels table: maps each integer label id to its label text. The labels look
-- like Penn Treebank part-of-speech tags. Values are positional: (id, label).
CREATE TABLE crf_label_new (
    id    integer,
    label character varying
);
INSERT INTO crf_label_new VALUES
    (0, 'CC'),
    (1, 'CD'),
    (2, 'DT'),
    (3, 'EX'),
    (4, 'FW'),
    (5, 'IN'),
    (6, 'JJ'),
    (7, 'JJR'),
    (8, 'JJS'),
    (9, 'LS'),
    (10, 'MD'),
    (11, 'NN'),
    (12, 'NNS'),
    (13, 'NNP'),
    (14, 'NNPS'),
    (15, 'PDT'),
    (16, 'POS'),
    (17, 'PRP'),
    (18, 'PRP$'),
    (19, 'RB'),
    (20, 'RBR'),
    (21, 'RBS'),
    (22, 'RP'),
    (23, 'SYM'),
    (24, 'TO'),
    (25, 'UH'),
    (26, 'VB'),
    (27, 'VBD'),
    (28, 'VBG'),
    (29, 'VBN'),
    (30, 'VBP'),
    (31, 'VBZ'),
    (32, 'WDT'),
    (33, 'WP'),
    (34, 'WP$'),
    (35, 'WRB'),
    (36, '$'),
    (37, '#'),
    (38, '''''');
-- Label 39 is two backquote characters, so its literal is wrapped in alternate
-- m4 quote delimiters for the preprocessor; quoting is reset right after.
m4_changequote(<!,!>)
INSERT INTO crf_label_new VALUES
    (39, <!'``'!>);
m4_changequote(,)
INSERT INTO crf_label_new VALUES
    (40, '('),
    (41, ')'),
    (42, ','),
    (43, '.'),
    (44, ':');
ANALYZE crf_label_new;
-- Segment table of the test data: one row per token of each test document.
--   start_pos  position of the token inside its document, starting at 0
--   doc_id     document the token belongs to
--   seg_text   token text
--   max_pos    largest start_pos of the document, repeated on each of its rows
CREATE TABLE test_segmenttbl_new (
    start_pos integer,
    doc_id    integer,
    seg_text  text,
    max_pos   integer
);
-- Tokens of doc_id 1, start_pos 0..30.
-- Values are positional: (start_pos, doc_id, seg_text, max_pos).
INSERT INTO test_segmenttbl_new VALUES
(0, 1, 'that', 30),
(1, 1, '''s', 30),
(2, 1, 'a', 30),
(3, 1, 'big', 30),
(4, 1, 'change', 30),
(5, 1, 'from', 30),
(6, 1, 'recent', 30),
(7, 1, 'years', 30),
(8, 1, 'when', 30),
(9, 1, 'freight', 30),
(10, 1, 'haulage', 30),
(11, 1, 'was', 30),
(12, 1, 'a', 30),
(13, 1, 'bright', 30),
(14, 1, 'spot', 30),
(15, 1, 'for', 30),
(16, 1, 'u.s.', 30),
(17, 1, 'productivity', 30),
(18, 1, ',', 30),
(19, 1, 'helping', 30),
(20, 1, 'to', 30),
(21, 1, 'restrain', 30),
(22, 1, 'inflation', 30),
(23, 1, 'and', 30),
(24, 1, 'make', 30),
(25, 1, 'u.s.', 30),
(26, 1, 'industry', 30),
(27, 1, 'more', 30),
(28, 1, 'competitive', 30),
(29, 1, 'abroad', 30),
(30, 1, '.', 30);
-- First token of doc_id 2 (start_pos 0) is two backquote characters. The
-- literal is wrapped in alternate m4 quote delimiters, presumably so the
-- preprocessor copies the backquotes through unchanged -- TODO confirm.
-- The closing macro call passes empty delimiters, which GNU m4 documents as
-- disabling quoting.
m4_changequote(<!,!>)
INSERT INTO test_segmenttbl_new VALUES
(0, 2, <!'``'!>, 40);
m4_changequote(,)
-- Remaining tokens. Values are positional:
-- (start_pos, doc_id, seg_text, max_pos).
INSERT INTO test_segmenttbl_new VALUES
-- doc_id 2, start_pos 1..40 (start_pos 0 is loaded by its own statement)
(1, 2, 'this', 40),
(2, 2, 'is', 40),
(3, 2, 'the', 40),
(4, 2, 'first', 40),
(5, 2, 'year', 40),
(6, 2, 'since', 40),
(7, 2, 'transportation', 40),
(8, 2, 'deregulation', 40),
(9, 2, 'in', 40),
(10, 2, '1980', 40),
(11, 2, 'that', 40),
(12, 2, 'we', 40),
(13, 2, 'have', 40),
(14, 2, 'had', 40),
(15, 2, 'such', 40),
(16, 2, 'a', 40),
(17, 2, 'dramatic', 40),
(18, 2, 'and', 40),
(19, 2, 'broad-based', 40),
(20, 2, 'upturn', 40),
(21, 2, 'in', 40),
(22, 2, 'perceived', 40),
(23, 2, 'transportation', 40),
(24, 2, 'rates', 40),
(25, 2, ',', 40),
(26, 2, '''''', 40),
(27, 2, 'said', 40),
(28, 2, 'bernard', 40),
(29, 2, 'lalonde', 40),
(30, 2, ',', 40),
(31, 2, 'a', 40),
(32, 2, 'transportation', 40),
(33, 2, 'logistics', 40),
(34, 2, 'professor', 40),
(35, 2, 'at', 40),
(36, 2, 'ohio', 40),
(37, 2, 'state', 40),
(38, 2, 'in', 40),
(39, 2, 'columbus', 40),
(40, 2, '.', 40),
-- doc_id 3, start_pos 0..16
(0, 3, 'carriers', 16),
(1, 3, 'could', 16),
(2, 3, 'use', 16),
(3, 3, 'their', 16),
(4, 3, 'equipment', 16),
(5, 3, 'more', 16),
(6, 3, 'efficiently', 16),
(7, 3, ',', 16),
(8, 3, 'leading', 16),
(9, 3, 'to', 16),
(10, 3, 'overcapacity', 16),
(11, 3, 'they', 16),
(12, 3, 'were', 16),
(13, 3, 'eager', 16),
(14, 3, 'to', 16),
(15, 3, 'fill', 16),
(16, 3, '.', 16),
-- doc_id 4, start_pos 0..17
(0, 4, 'the', 17),
(1, 4, 'deregulation', 17),
(2, 4, 'of', 17),
(3, 4, 'railroads', 17),
(4, 4, 'and', 17),
(5, 4, 'trucking', 17),
(6, 4, 'companies', 17),
(7, 4, 'that', 17),
(8, 4, 'began', 17),
(9, 4, 'in', 17),
(10, 4, '1980', 17),
(11, 4, 'enabled', 17),
(12, 4, 'shippers', 17),
(13, 4, 'to', 17),
(14, 4, 'bargain', 17),
(15, 4, 'for', 17),
(16, 4, 'transportation', 17),
(17, 4, '.', 17);
-- Collect planner statistics now that the segment table is fully loaded.
analyze test_segmenttbl_new;
-- Extract features for the tokens stored in the segment table. The last two
-- names are handed on to the labeling call below, so they are presumably
-- tables written by this step -- TODO confirm against the function docs.
SELECT crf_test_fgen(
    'test_segmenttbl_new',    -- segment table
    'crf_dictionary_new',     -- dictionary table
    'crf_label_new',          -- labels table
    'crf_regex_new',          -- regex table
    'crf_feature_test_new',   -- features table
    'viterbi_mtbl_new',
    'viterbi_rtbl_new'
);
-- Label the segments. The last argument, extraction_new, is the table that
-- the final check of this script compares with the expected labeling.
SELECT vcrf_label(
    'test_segmenttbl_new',
    'viterbi_mtbl_new',
    'viterbi_rtbl_new',
    'crf_label_new',
    'extraction_new'
);
-- Expected viterbi labeling result, one row per token.
-- These labels are what Dr. Sunita's CRF java package produced for the same
-- input, so they are a reference output to match and not hand-checked tags.
CREATE TABLE expected_extraction_new (
    doc_id    integer,
    start_pos integer,
    seg_text  text,
    label     character varying
);
-- Expected labels for doc_id 1, start_pos 0..30.
-- Values are positional: (doc_id, start_pos, seg_text, label). Note that
-- doc_id comes first here, unlike the rows of test_segmenttbl_new.
INSERT INTO expected_extraction_new VALUES
(1, 0, 'that', 'DT'),
(1, 1, '''s', 'VBZ'),
(1, 2, 'a', 'DT'),
(1, 3, 'big', 'JJ'),
(1, 4, 'change', 'NN'),
(1, 5, 'from', 'IN'),
(1, 6, 'recent', 'JJ'),
(1, 7, 'years', 'NNS'),
(1, 8, 'when', 'WRB'),
(1, 9, 'freight', 'NN'),
(1, 10, 'haulage', 'NN'),
(1, 11, 'was', 'VBD'),
(1, 12, 'a', 'DT'),
(1, 13, 'bright', 'JJ'),
(1, 14, 'spot', 'NN'),
(1, 15, 'for', 'IN'),
(1, 16, 'u.s.', 'NNP'),
(1, 17, 'productivity', 'NN'),
(1, 18, ',', ','),
(1, 19, 'helping', 'VBG'),
(1, 20, 'to', 'TO'),
(1, 21, 'restrain', 'VB'),
(1, 22, 'inflation', 'NN'),
(1, 23, 'and', 'CC'),
(1, 24, 'make', 'VB'),
(1, 25, 'u.s.', 'NNP'),
(1, 26, 'industry', 'NN'),
(1, 27, 'more', 'RBR'),
(1, 28, 'competitive', 'JJ'),
(1, 29, 'abroad', 'RB'),
(1, 30, '.', '.');
-- Expected row for doc_id 2, start_pos 0: both the token and its label are
-- two backquote characters. Each literal is wrapped in alternate m4 quote
-- delimiters, presumably so the preprocessor copies the backquotes through
-- unchanged -- TODO confirm. The closing macro call passes empty delimiters,
-- which GNU m4 documents as disabling quoting.
m4_changequote(<!,!>)
INSERT INTO expected_extraction_new VALUES
(2, 0, <!'``'!>, <!'``'!>);
m4_changequote(,)
-- Remaining expected labels. Values are positional:
-- (doc_id, start_pos, seg_text, label).
INSERT INTO expected_extraction_new VALUES
-- doc_id 2, start_pos 1..40 (start_pos 0 is loaded by its own statement)
(2, 1, 'this', 'DT'),
(2, 2, 'is', 'VBZ'),
(2, 3, 'the', 'DT'),
(2, 4, 'first', 'JJ'),
(2, 5, 'year', 'NN'),
(2, 6, 'since', 'IN'),
(2, 7, 'transportation', 'NN'),
(2, 8, 'deregulation', 'NN'),
(2, 9, 'in', 'IN'),
(2, 10, '1980', 'CD'),
(2, 11, 'that', 'DT'),
(2, 12, 'we', 'JJ'),
(2, 13, 'have', 'JJ'),
(2, 14, 'had', 'NN'),
(2, 15, 'such', 'IN'),
(2, 16, 'a', 'DT'),
(2, 17, 'dramatic', 'NN'),
(2, 18, 'and', 'CC'),
(2, 19, 'broad-based', 'VBN'),
(2, 20, 'upturn', 'RB'),
(2, 21, 'in', 'IN'),
(2, 22, 'perceived', 'VBN'),
(2, 23, 'transportation', 'NN'),
(2, 24, 'rates', 'NNS'),
(2, 25, ',', ','),
(2, 26, '''''', ''''''),
(2, 27, 'said', 'VBD'),
(2, 28, 'bernard', 'NNP'),
(2, 29, 'lalonde', 'NNP'),
(2, 30, ',', ','),
(2, 31, 'a', 'DT'),
(2, 32, 'transportation', 'NN'),
(2, 33, 'logistics', 'NNS'),
(2, 34, 'professor', 'IN'),
(2, 35, 'at', 'IN'),
(2, 36, 'ohio', 'NNP'),
(2, 37, 'state', 'NNP'),
(2, 38, 'in', 'NNP'),
(2, 39, 'columbus', 'NNP'),
(2, 40, '.', '.'),
-- doc_id 3, start_pos 0..16
(3, 0, 'carriers', 'NNS'),
(3, 1, 'could', 'VBP'),
(3, 2, 'use', 'VBN'),
(3, 3, 'their', 'PRP$'),
(3, 4, 'equipment', 'JJ'),
(3, 5, 'more', 'RBR'),
(3, 6, 'efficiently', 'JJ'),
(3, 7, ',', ','),
(3, 8, 'leading', 'VBG'),
(3, 9, 'to', 'TO'),
(3, 10, 'overcapacity', 'VB'),
(3, 11, 'they', 'PRP'),
(3, 12, 'were', 'VBP'),
(3, 13, 'eager', 'VBG'),
(3, 14, 'to', 'TO'),
(3, 15, 'fill', 'VB'),
(3, 16, '.', '.'),
-- doc_id 4, start_pos 0..17
(4, 0, 'the', 'DT'),
(4, 1, 'deregulation', 'NN'),
(4, 2, 'of', 'IN'),
(4, 3, 'railroads', 'NN'),
(4, 4, 'and', 'CC'),
(4, 5, 'trucking', 'NNP'),
(4, 6, 'companies', 'VBD'),
(4, 7, 'that', 'DT'),
(4, 8, 'began', 'NN'),
(4, 9, 'in', 'IN'),
(4, 10, '1980', 'CD'),
(4, 11, 'enabled', 'VBN'),
(4, 12, 'shippers', 'NNS'),
(4, 13, 'to', 'TO'),
(4, 14, 'bargain', 'VB'),
(4, 15, 'for', 'IN'),
(4, 16, 'transportation', 'NN'),
(4, 17, '.', '.');
-- Compare the expected result with the viterbi extraction result. The check
-- passes only when both tables hold the same rows: the EXCEPT ALL difference
-- is counted in each direction and the two counts must add up to zero.
SELECT assert(
    missing.n_rows + unexpected.n_rows = 0,
    'Labels predicted do not match expected labels.'
)
FROM (
    -- rows that were expected but are absent from the extraction
    SELECT count(*) AS n_rows
    FROM (
        SELECT doc_id, start_pos, seg_text, label
        FROM expected_extraction_new
        EXCEPT ALL
        SELECT doc_id, start_pos, seg_text, label
        FROM extraction_new
    ) AS expected_only
) AS missing
CROSS JOIN (
    -- rows that were extracted but are not in the expected result
    SELECT count(*) AS n_rows
    FROM (
        SELECT doc_id, start_pos, seg_text, label
        FROM extraction_new
        EXCEPT ALL
        SELECT doc_id, start_pos, seg_text, label
        FROM expected_extraction_new
    ) AS extracted_only
) AS unexpected;