| --------------------------------------------------------------------------- |
| -- Rules: |
| -- ------ |
| -- 1) Any DB objects should be created without a schema prefix, |
| -- since this file is executed in a separate schema context. |
| -- 2) There should be no DROP statements in this script, since |
| -- all objects created in the default schema will be cleaned up outside. |
| --------------------------------------------------------------------------- |
| |
| -- Features table produced by Dr. Sunita's CRF java package (as provided by Kun, "CRF2" package on github) using the training data in the file us50.train.tagged |
| -- One row per feature: name is the feature name, label_id is the label the feature is attached to, |
| -- and weight is the weight exported by the CRF package. |
| -- prev_label_id is -1 except on the E. rows, which carry a previous label id |
| -- (presumably label-to-label transition features -- confirm). |
| -- NOTE(review): label ids presumably refer to crf_label_new.id -- confirm. |
| CREATE TABLE crf_feature_test_new (id integer,name text,prev_label_id integer,label_id integer,weight float); |
| -- Explicit column list: the fixture rows stay bound to the intended columns even if the DDL changes. |
| INSERT INTO crf_feature_test_new (id, name, prev_label_id, label_id, weight) VALUES |
| (0, 'S.', -1, 11, 0.5516753522178934), |
| (1, 'W_freight', -1, 11, 5.959241076198326), |
| (2, 'E.', 11, 12, 2.0789747316372034), |
| (3, 'W_rates', -1, 12, 2.0837653985907174), |
| (4, 'R_endsWithS', -1, 12, 5.306222451221396), |
| (5, 'E.', 12, 42, 1.8107793215017256), |
| (6, 'W_,', -1, 42, 7.286509411020296), |
| (7, 'E.', 42, 28, 2.222715331802022), |
| (8, 'U', -1, 28, 0.9492501230997201), |
| (9, 'W_declining', -1, 28, 0.9644339685939746), |
| (10, 'R_endsWithIng', -1, 28, 5.3575087838268995), |
| (11, 'E.', 28, 5, 2.459070565360766), |
| (12, 'W_for', -1, 5, 4.538367636569374), |
| (13, 'E.', 5, 21, 2.633416169216998), |
| (14, 'U', -1, 21, 1.2724051325350452), |
| (15, 'W_most', -1, 21, 4.742804679577188), |
| (16, 'E.', 21, 5, 2.3563567868687803), |
| (17, 'W_of', -1, 5, 5.886369615822257), |
| (18, 'E.', 5, 2, 3.4072332563788974), |
| (19, 'W_the', -1, 2, 5.003139924048258), |
| (20, 'E.', 2, 11, 3.4951824947429704), |
| (21, 'U', -1, 11, 1.3530610141204105), |
| (22, 'W_decade', -1, 11, 2.5363441670581475), |
| (23, 'E.', 11, 5, 3.168556891460435), |
| (24, 'U', -1, 5, 0.8846547539288385), |
| (25, 'W_because', -1, 5, 4.371973715457267), |
| (26, 'E.', 5, 5, 2.3178076749274994), |
| (27, 'E.', 5, 11, 3.008379674551115), |
| (28, 'W_competition', -1, 11, 3.7657358279072044), |
| (29, 'E.', 11, 29, 1.4638353616301887), |
| (30, 'U', -1, 29, 1.661969150148885), |
| (31, 'W_spurred', -1, 29, 1.0199935936286502), |
| (32, 'R_endsWithED', -1, 29, 5.781210636435915), |
| (33, 'E.', 29, 5, 2.610516071895145), |
| (34, 'W_by', -1, 5, 4.016756180248158), |
| (35, 'W_deregulation', -1, 11, 3.107044113256852), |
| (36, 'E.', 11, 42, 3.0918965630757156), |
| (37, 'E.', 42, 30, 1.8076032969941829), |
| (38, 'W_are', -1, 30, 3.4834944891071182), |
| (39, 'E.', 30, 28, 3.0246636670581872), |
| (40, 'W_bottoming', -1, 28, 0.9246692126583386), |
| (41, 'W_out', -1, 5, 4.166232235849933), |
| (42, 'E.', 5, 42, 1.4287166848500423), |
| (43, 'W_turning', -1, 28, 0.9453908838237025), |
| (44, 'E.', 28, 19, 1.9755638903302428), |
| (45, 'U', -1, 19, 2.276171985449911), |
| (46, 'W_upward', -1, 19, 3.0573963140454667), |
| (47, 'E.', 19, 0, 1.7604125260111), |
| (48, 'W_and', -1, 0, 6.74422233788292), |
| (49, 'E.', 0, 28, 1.3137638161586382), |
| (50, 'W_threatening', -1, 28, 1.4170133688772073), |
| (51, 'E.', 28, 24, 1.6388299340148602), |
| (52, 'W_to', -1, 24, 5.749953289380646), |
| (53, 'E.', 24, 26, 6.296991117616081), |
| (54, 'U', -1, 26, 1.5167524281785714), |
| (55, 'W_fuel', -1, 26, 0.8669685656391124), |
| (56, 'E.', 26, 11, 2.7884267413237405), |
| (57, 'W_inflation', -1, 11, 4.497973086341019), |
| (58, 'E.', 11, 43, 2.1195755732689774), |
| (59, 'End.', -1, 43, 4.0877966130773755), |
| (60, 'W_.', -1, 43, 4.0877966130773755), |
| (61, 'S.', -1, 13, 1.3657688665015333), |
| (62, 'U', -1, 13, 2.0185522749436866), |
| (63, 'W_trucking', -1, 13, 3.1701428979681845), |
| (64, 'R_endsWithIng', -1, 13, 2.1776484880257367), |
| (65, 'E.', 13, 42, 1.497373571911848), |
| (66, 'W_shipping', -1, 28, 1.4634895303741247), |
| (67, 'E.', 28, 0, 1.3751429994639364), |
| (68, 'E.', 0, 11, 2.448895430660733), |
| (69, 'W_air-freight', -1, 11, 3.276197718236269), |
| (70, 'R_isDashSeparatedWords', -1, 11, 1.390362987986471), |
| (71, 'U', -1, 12, -0.039795570326036186), |
| (72, 'W_companies', -1, 12, 1.410903273102038), |
| (73, 'E.', 12, 30, 2.973781128828628), |
| (74, 'U', -1, 30, 1.3700945582140844), |
| (75, 'W_have', -1, 30, 3.0882453767746747), |
| (76, 'E.', 30, 29, 1.9098632637338704), |
| (77, 'W_announced', -1, 29, 1.4380819186872527), |
| (78, 'E.', 29, 11, 0.9060674369789037), |
| (79, 'W_rate', -1, 11, 5.064883717975049), |
| (80, 'W_increases', -1, 12, 2.319719757676896), |
| (81, 'E.', 42, 29, 1.299378678795197), |
| (82, 'W_scheduled', -1, 29, 0.9387965126997426), |
| (83, 'W_this', -1, 2, 4.370856586315017), |
| (84, 'R_endsWithS', -1, 2, 2.076343517584421), |
| (85, 'W_fall', -1, 11, 2.1032438732239624), |
| (86, 'E.', 11, 0, 3.030829781200263), |
| (87, 'W_or', -1, 0, 6.18424407183645), |
| (88, 'E.', 0, 6, 0.868722661655972), |
| (89, 'U', -1, 6, 2.23345149856934), |
| (90, 'W_early', -1, 6, 3.384977523038234), |
| (91, 'R_endsWithly', -1, 6, 1.5733441079056303), |
| (92, 'E.', 6, 6, 2.7161792323578373), |
| (93, 'W_next', -1, 6, 4.2480555895859), |
| (94, 'E.', 6, 11, 2.9916009076630092), |
| (95, 'W_year', -1, 11, 3.5174064665274014), |
| (96, 'W_reflecting', -1, 28, 0.6113402176204787), |
| (97, 'E.', 28, 7, 2.819730039901505), |
| (98, 'U', -1, 7, -0.18779864470197277), |
| (99, 'W_higher', -1, 7, 3.1797755805421333), |
| (100, 'R_endsWithER', -1, 7, 1.5728998100190603), |
| (101, 'E.', 7, 12, 2.318262709087868), |
| (102, 'W_costs', -1, 12, 2.381538004167296), |
| (103, 'E.', 12, 0, 0.9855356754199541), |
| (104, 'E.', 0, 27, 1.8595054995497817), |
| (105, 'U', -1, 27, 1.5446722882388022), |
| (106, 'W_tightened', -1, 27, 3.8726486164808493), |
| (107, 'R_endsWithED', -1, 27, 2.5351002298167784), |
| (108, 'E.', 27, 11, 2.274821834309125), |
| (109, 'W_demand', -1, 11, 4.064293186476195), |
| (110, 'E.', 11, 11, 2.2005287976885133), |
| (111, 'W_transport', -1, 11, 4.716356817414893), |
| (112, 'S.', -1, 6, 0.6581636527243294), |
| (113, 'W_major', -1, 6, 2.980552933449864), |
| (114, 'E.', 6, 12, 3.065321079522356), |
| (115, 'W_shippers', -1, 12, 2.542917930136408), |
| (116, 'W_say', -1, 30, 2.2479773095428093), |
| (117, 'E.', 30, 17, 2.874166869399026), |
| (118, 'W_they', -1, 17, 3.143575891184453), |
| (119, 'E.', 17, 30, 4.2749079217250445), |
| (120, 'W_expect', -1, 30, 3.595622140488586), |
| (121, 'E.', 30, 11, 1.6611422263789781), |
| (122, 'E.', 12, 24, 1.1213816226246236), |
| (123, 'W_rise', -1, 26, 1.5778724980775618), |
| (124, 'E.', 26, 5, 2.568809140561043), |
| (125, 'W_at', -1, 5, 3.9554245620023827), |
| (126, 'E.', 5, 8, 2.6559245091013293), |
| (127, 'U', -1, 8, 0.9950701292906889), |
| (128, 'W_least', -1, 8, 4.103392620219655), |
| (129, 'E.', 8, 19, 3.709110073868926), |
| (130, 'W_as', -1, 19, 3.1580690701037972), |
| (131, 'R_endsWithS', -1, 19, 1.0980845299769795), |
| (132, 'E.', 19, 19, 2.1155578516782), |
| (133, 'W_fast', -1, 19, 3.951081121254258), |
| (134, 'E.', 19, 5, 2.364546163338331), |
| (135, 'W_as', -1, 5, 4.459742427306834), |
| (136, 'R_endsWithS', -1, 5, 0.27528290655402593), |
| (137, 'E.', 0, 19, 2.002143530981041), |
| (138, 'W_maybe', -1, 19, 3.5081392526668402), |
| (139, 'E.', 19, 20, 2.2175425248487377), |
| (140, 'U', -1, 20, 0.5355418339873245), |
| (141, 'W_faster', -1, 20, 3.71210878977271), |
| (142, 'R_endsWithER', -1, 20, 3.472793820076381), |
| (143, 'E.', 20, 5, 1.3108157211407567), |
| (144, 'W_in', -1, 5, 3.840521231387154), |
| (145, 'E.', 2, 6, 2.8789088935047493), |
| (146, 'W_few', -1, 6, 2.4522548748428017), |
| (147, 'W_years', -1, 12, 1.7038384234850026), |
| (148, 'E.', 12, 43, 1.3730441332544099), |
| (149, 'S.', -1, 2, 1.926568025628669), |
| (150, 'U', -1, 2, 0.18622978543338722), |
| (151, 'W_that', -1, 2, 3.7701267071548883), |
| (152, 'E.', 2, 31, 3.0418354933422265), |
| (153, 'U', -1, 31, 1.1440561095338802), |
| (154, 'W_''s', -1, 31, 2.7291745322476184), |
| (155, 'R_endsWithS', -1, 31, 2.6658498367189947), |
| (156, 'E.', 31, 2, 1.956185562422949), |
| (157, 'W_a', -1, 2, 4.567599380671607), |
| (158, 'W_big', -1, 6, 2.9735195102454743), |
| (159, 'W_change', -1, 11, 3.3193727375317574), |
| (160, 'W_from', -1, 5, 3.8878088109703994), |
| (161, 'E.', 5, 6, 2.454738423661261), |
| (162, 'W_recent', -1, 6, 2.813702206270077), |
| (163, 'E.', 12, 35, 2.327670018390557), |
| (164, 'U', -1, 35, 1.4528234424876696), |
| (165, 'W_when', -1, 35, 4.011265086980163), |
| (166, 'E.', 35, 11, 2.857525452775004), |
| (167, 'W_haulage', -1, 11, 4.32506396474813), |
| (168, 'E.', 11, 27, 1.9580138925595794), |
| (169, 'W_was', -1, 27, 4.2660057629684), |
| (170, 'R_endsWithS', -1, 27, 1.7934223574225978), |
| (171, 'E.', 27, 2, 2.2891724016343495), |
| (172, 'W_bright', -1, 6, 2.992681149090579), |
| (173, 'W_spot', -1, 11, 3.220613491106308), |
| (174, 'E.', 5, 13, 2.6412138657174293), |
| (175, 'W_u.s.', -1, 13, 3.2079965049318577), |
| (176, 'R_endsWithDot', -1, 13, 3.2079965049318577), |
| (177, 'E.', 13, 11, 2.356503203677101), |
| (178, 'W_productivity', -1, 11, 3.822919829706685), |
| (179, 'W_helping', -1, 28, 0.694693493722968), |
| (180, 'W_restrain', -1, 26, 0.8600913604077386), |
| (181, 'E.', 0, 26, 2.411953958040776), |
| (182, 'W_make', -1, 26, 3.898590273777646), |
| (183, 'E.', 26, 13, 2.1195180614626326), |
| (184, 'W_industry', -1, 11, 3.6043926013837373), |
| (185, 'E.', 11, 20, 2.609806711358176), |
| (186, 'W_more', -1, 20, 3.552659383833094), |
| (187, 'E.', 20, 6, 3.1775432403400985), |
| (188, 'W_competitive', -1, 6, 2.976622053158026), |
| (189, 'E.', 6, 19, 1.771180632746488), |
| (190, 'W_abroad', -1, 19, 3.795196762237916), |
| (191, 'E.', 19, 43, 2.295613975954753), |
| (192, 'S.', -1, 39, 2.186455757483502); |
| -- The feature name in the next row ends with a pair of backquote characters, which m4 could |
| -- treat as quote delimiters. The delimiters are switched around that single row so the literal |
| -- reaches SQL unchanged, and are then set to empty strings before the remaining rows are loaded. |
| m4_changequote(<!,!>) |
| INSERT INTO crf_feature_test_new (id, name, prev_label_id, label_id, weight) VALUES |
| (193, <!'W_``'!>, -1, 39, 5.004869688499841); |
| m4_changequote(,) |
| INSERT INTO crf_feature_test_new (id, name, prev_label_id, label_id, weight) VALUES |
| (194, 'E.', 39, 11, 2.5755893774727556), |
| (195, 'E.', 11, 31, 1.7721648420817446), |
| (196, 'W_has', -1, 31, 3.93625538731919), |
| (197, 'E.', 31, 29, 2.6386288731321472), |
| (198, 'W_caught', -1, 29, 3.129300187453606), |
| (199, 'W_up', -1, 5, 4.151560395841534), |
| (200, 'W_with', -1, 5, 3.675511355861305), |
| (201, 'W_supply', -1, 11, 2.715998611959373), |
| (202, 'R_endsWithly', -1, 11, 0.7673422652186473), |
| (203, 'W_certain', -1, 6, 2.851915076211084), |
| (204, 'W_types', -1, 12, 1.6258788404774098), |
| (205, 'E.', 12, 5, 2.6905432557328264), |
| (206, 'W_transportation', -1, 11, 3.191264085926264), |
| (207, 'E.', 42, 0, 1.2711473691909434), |
| (208, 'E.', 0, 12, 1.182207619933404), |
| (209, 'W_starting', -1, 28, 0.5515350033996826), |
| (210, 'W_move', -1, 26, 0.8120644141223547), |
| (211, 'E.', 5, 38, 2.7182674903817587), |
| (212, 'W_''''', -1, 38, 5.170862953158661), |
| (213, 'E.', 38, 5, 2.317192606205641), |
| (214, 'E.', 11, 39, 1.7538803171442363), |
| (215, 'E.', 39, 19, 3.0627961041121843), |
| (216, 'W_close', -1, 19, 2.495992180876776), |
| (217, 'E.', 19, 24, 1.5800294126144967), |
| (218, 'E.', 24, 0, 2.241039628902612), |
| (219, 'W_slightly', -1, 19, 1.3492186869598903), |
| (220, 'R_endsWithly', -1, 19, 2.4413654689445927), |
| (221, 'E.', 19, 7, 2.9386643499826097), |
| (222, 'W_more', -1, 7, 4.452742873438579), |
| (223, 'E.', 7, 5, 2.9936854236944725), |
| (224, 'W_than', -1, 5, 2.1644820524819646), |
| (225, 'E.', 42, 38, 1.4996394861187865), |
| (226, 'E.', 38, 27, 2.7463692256732095), |
| (227, 'W_said', -1, 27, 4.677340288404336), |
| (228, 'E.', 27, 13, 1.9336746850852031), |
| (229, 'W_clifford', -1, 13, 2.0747538313323317), |
| (230, 'E.', 13, 13, 3.9294546550138674), |
| (231, 'W_sayre', -1, 13, 3.3385592403455577), |
| (232, 'E.', 42, 11, 2.2914590080022816), |
| (233, 'W_director', -1, 11, 3.4531076078614316), |
| (234, 'E.', 5, 12, 1.054540995246233), |
| (235, 'W_logistics', -1, 12, 2.9422159343971326), |
| (236, 'W_du', -1, 13, 2.31602144186341), |
| (237, 'W_pont', -1, 13, 1.4463820426038703), |
| (238, 'W_co', -1, 13, 2.5456207401359405), |
| (239, 'E.', 13, 43, 1.7816633932596668), |
| (240, 'S.', -1, 12, 1.0387123582845164), |
| (241, 'E.', 12, 29, 1.1881628310493961), |
| (242, 'W_surveyed', -1, 29, 0.892458430315305), |
| (243, 'E.', 29, 19, 1.532676298227432), |
| (244, 'W_recently', -1, 19, 2.2798691676335316), |
| (245, 'W_ohio', -1, 13, 2.344559598905777), |
| (246, 'W_state', -1, 13, 1.3829936762017245), |
| (247, 'W_university', -1, 13, 2.237132870920883), |
| (248, 'E.', 13, 27, 1.9577360535758594), |
| (249, 'E.', 27, 17, 2.2087856989728887), |
| (250, 'E.', 30, 18, 2.079584829527366), |
| (251, 'W_their', -1, 18, 4.067554496475091), |
| (252, 'E.', 18, 6, 3.6290543548238685), |
| (253, 'W_freight-transport', -1, 6, 3.2568844715837297), |
| (254, 'R_isDashSeparatedWords', -1, 6, 0.7948432491344457), |
| (255, 'E.', 6, 42, 1.2615186281064843), |
| (256, 'W_storage', -1, 11, 2.632730740932247), |
| (257, 'W_distribution', -1, 11, 3.9157525580557246), |
| (258, 'W_about', -1, 5, 3.065121356369797), |
| (259, 'E.', 5, 1, 2.5380914123626104), |
| (260, 'W_DIGIT', -1, 1, 3.716164825033898), |
| (261, 'R_isAllCapital', -1, 1, 3.716164825033898), |
| (262, 'E.', 1, 11, 2.691091194255502), |
| (263, 'W_%', -1, 11, 4.045579535978065), |
| (264, 'E.', 11, 2, 1.5517451432551261), |
| (265, 'S.', -1, 19, 1.0762484384385997), |
| (266, 'W_only', -1, 19, 2.268536436241348), |
| (267, 'E.', 19, 1, 1.4108190607695246), |
| (268, 'E.', 2, 1, 1.6621246460306802), |
| (269, 'E.', 1, 12, 1.41212953803894), |
| (270, 'W_polled', -1, 29, 1.647493520976822), |
| (271, 'E.', 29, 29, 0.7767009450630513), |
| (272, 'W_expected', -1, 29, 0.665862099618371), |
| (273, 'E.', 29, 18, 1.8495571863795943), |
| (274, 'W_decrease', -1, 26, 1.2921324719942597), |
| (275, 'E.', 26, 42, 1.0963707625581782), |
| (276, 'W_compared', -1, 29, 0.9464663786685238), |
| (277, 'E.', 11, 33, 1.9870890250594957), |
| (278, 'U', -1, 33, 0.9707407033846202), |
| (279, 'W_who', -1, 33, 4.159314366798417), |
| (280, 'E.', 33, 27, 3.0290938616080796), |
| (281, 'W_had', -1, 27, 3.210438877671987), |
| (282, 'E.', 27, 29, 1.498126968305331), |
| (283, 'W_looked', -1, 29, 0.8477036764861315), |
| (284, 'E.', 29, 24, 1.058282178536846), |
| (285, 'W_freight', -1, 26, 2.6547922325266726), |
| (286, 'E.', 11, 24, 0.7587012089044496), |
| (287, 'W_reduce', -1, 26, 1.9870512796567945), |
| (288, 'E.', 26, 12, 0.6748848167259296), |
| (289, 'W_past', -1, 6, 2.852378831268221); |
| -- Collect planner statistics for the freshly loaded feature table. |
| ANALYZE crf_feature_test_new; |
| |
| -- Dictionary table of test data |
| -- One row per token together with its total count. |
| -- NOTE(review): the tokens appear to mirror the W_ feature names in crf_feature_test_new, so |
| -- total is presumably the occurrence count in the training text -- confirm. |
| CREATE TABLE crf_dictionary_new (token text,total integer); |
| -- The first token is a pair of backquote characters, which m4 could treat as quote delimiters. |
| -- The delimiters are switched around that single row and set to empty strings afterwards. |
| m4_changequote(<!,!>) |
| INSERT INTO crf_dictionary_new (token, total) VALUES |
| (<!'``'!>, 2); |
| m4_changequote(,) |
| INSERT INTO crf_dictionary_new (token, total) VALUES |
| (',',12), |
| ('.',7), |
| ('''''',2), |
| ('%',3), |
| ('a',3), |
| ('about',1), |
| ('abroad',1), |
| ('air-freight',1), |
| ('and',7), |
| ('announced',1), |
| ('are',2), |
| ('as',2), |
| ('at',3), |
| ('because',1), |
| ('big',1), |
| ('bottoming',1), |
| ('bright',1), |
| ('by',2), |
| ('caught',1), |
| ('certain',1), |
| ('change',1), |
| ('clifford',1), |
| ('close',1), |
| ('co',1), |
| ('companies',1), |
| ('compared',1), |
| ('competition',1), |
| ('competitive',1), |
| ('costs',4), |
| ('decade',1), |
| ('declining',1), |
| ('decrease',1), |
| ('demand',2), |
| ('deregulation',1), |
| ('DIGIT',4), |
| ('director',1), |
| ('distribution',1), |
| ('du',1), |
| ('early',1), |
| ('expect',2), |
| ('expected',1), |
| ('fall',1), |
| ('fast',1), |
| ('faster',1), |
| ('few',1), |
| ('for',4), |
| ('freight',6), |
| ('freight-transport',2), |
| ('from',1), |
| ('fuel',1), |
| ('had',1), |
| ('has',1), |
| ('haulage',1), |
| ('have',1), |
| ('helping',1), |
| ('higher',1), |
| ('in',2), |
| ('increases',1), |
| ('industry',1), |
| ('inflation',4), |
| ('least',1), |
| ('logistics',1), |
| ('looked',1), |
| ('major',1), |
| ('make',1), |
| ('maybe',1), |
| ('more',2), |
| ('most',1), |
| ('move',1), |
| ('next',2), |
| ('of',6), |
| ('ohio',1), |
| ('only',1), |
| ('or',2), |
| ('out',1), |
| ('past',1), |
| ('polled',1), |
| ('pont',1), |
| ('productivity',1), |
| ('rate',3), |
| ('rates',3), |
| ('recent',1), |
| ('recently',1), |
| ('reduce',1), |
| ('reflecting',1), |
| ('restrain',1), |
| ('rise',2), |
| ('''s',1), |
| ('said',2), |
| ('say',1), |
| ('sayre',1), |
| ('scheduled',1), |
| ('shippers',3), |
| ('shipping',1), |
| ('slightly',1), |
| ('spot',1), |
| ('spurred',1), |
| ('starting',1), |
| ('state',1), |
| ('storage',1), |
| ('supply',1), |
| ('surveyed',1), |
| ('than',1), |
| ('that',1), |
| ('the',5), |
| ('their',2), |
| ('they',2), |
| ('this',2), |
| ('threatening',1), |
| ('tightened',1), |
| ('to',9), |
| ('transport',2), |
| ('transportation',1), |
| ('trucking',1), |
| ('turning',1), |
| ('types',1), |
| ('university',1), |
| ('up',2), |
| ('upward',1), |
| ('u.s.',2), |
| ('was',1), |
| ('when',1), |
| ('who',1), |
| ('with',2), |
| ('year',2), |
| ('years',3); |
| |
| |
| -- Collect planner statistics for the freshly loaded dictionary table. |
| ANALYZE crf_dictionary_new; |
| |
| -- Regex table |
| -- Maps a regular-expression pattern to a short name. Several of these names show up in |
| -- crf_feature_test_new with an R_ prefix (for example R_endsWithS, R_isAllCapital). |
| CREATE TABLE crf_regex_new (pattern text,name text); |
| -- Explicit column list: rows stay bound to the intended columns even if the DDL changes. |
| INSERT INTO crf_regex_new (pattern, name) VALUES |
| ('^[A-Z][a-z]+$','InitCapital'), |
| ('^[A-Z]+$','isAllCapital'), |
| ('^.*[0-9]+.*$','containsDigit'), |
| ('^.+[.]$','endsWithDot'), |
| ('^.+[,]$','endsWithComma'), |
| ('^.+er$','endsWithER'), |
| ('^.+est$','endsWithEst'), |
| ('^.+ed$','endsWithED'), |
| ('^.+s$','endsWithS'), |
| ('^.+ing$','endsWithIng'), |
| ('^.+ly$','endsWithly'), |
| ('^.+-.+$','isDashSeparatedWords'), |
| ('^.*@.*$','isEmailId'); |
| ANALYZE crf_regex_new; |
| |
| -- Labels table |
| -- Maps each numeric label id to its tag text (the tags look like Penn Treebank part-of-speech |
| -- tags). The ids are the values used in label_id / prev_label_id of crf_feature_test_new |
| -- (presumably -- confirm). |
| CREATE TABLE crf_label_new (id integer,label character varying); |
| -- Explicit column lists: rows stay bound to the intended columns even if the DDL changes. |
| INSERT INTO crf_label_new (id, label) VALUES |
| (0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'), |
| (9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'), |
| (18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'), |
| (27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'), |
| (36,'$'), (37,'#'), (38,''''''); |
| -- Label 39 is a pair of backquote characters, which m4 could treat as quote delimiters. |
| -- The delimiters are switched around that single row and set to empty strings afterwards. |
| m4_changequote(<!,!>) |
| INSERT INTO crf_label_new (id, label) VALUES |
| (39,<!'``'!>); |
| m4_changequote(,) |
| INSERT INTO crf_label_new (id, label) VALUES |
| (40,'('), (41,')'), (42,','), (43,'.'), (44,':'); |
| ANALYZE crf_label_new; |
| |
| -- Segment table of test data |
| -- One row per token: start_pos is the token position inside document doc_id, seg_text is the |
| -- token text, and max_pos repeats the last start_pos of that document on every row. |
| CREATE TABLE test_segmenttbl_new (start_pos integer,doc_id integer,seg_text text,max_pos integer); |
| -- Explicit column list: rows stay bound to the intended columns even if the DDL changes. |
| INSERT INTO test_segmenttbl_new (start_pos, doc_id, seg_text, max_pos) VALUES |
| (0, 1, 'that', 30), |
| (1, 1, '''s', 30), |
| (2, 1, 'a', 30), |
| (3, 1, 'big', 30), |
| (4, 1, 'change', 30), |
| (5, 1, 'from', 30), |
| (6, 1, 'recent', 30), |
| (7, 1, 'years', 30), |
| (8, 1, 'when', 30), |
| (9, 1, 'freight', 30), |
| (10, 1, 'haulage', 30), |
| (11, 1, 'was', 30), |
| (12, 1, 'a', 30), |
| (13, 1, 'bright', 30), |
| (14, 1, 'spot', 30), |
| (15, 1, 'for', 30), |
| (16, 1, 'u.s.', 30), |
| (17, 1, 'productivity', 30), |
| (18, 1, ',', 30), |
| (19, 1, 'helping', 30), |
| (20, 1, 'to', 30), |
| (21, 1, 'restrain', 30), |
| (22, 1, 'inflation', 30), |
| (23, 1, 'and', 30), |
| (24, 1, 'make', 30), |
| (25, 1, 'u.s.', 30), |
| (26, 1, 'industry', 30), |
| (27, 1, 'more', 30), |
| (28, 1, 'competitive', 30), |
| (29, 1, 'abroad', 30), |
| (30, 1, '.', 30); |
| -- The first token of document 2 is a pair of backquote characters, which m4 could treat as |
| -- quote delimiters. The delimiters are switched around that single row so the literal reaches |
| -- SQL unchanged, and are set to empty strings afterwards. |
| m4_changequote(<!,!>) |
| INSERT INTO test_segmenttbl_new (start_pos, doc_id, seg_text, max_pos) VALUES |
| (0, 2, <!'``'!>, 40); |
| m4_changequote(,) |
| INSERT INTO test_segmenttbl_new (start_pos, doc_id, seg_text, max_pos) VALUES |
| (1, 2, 'this', 40), |
| (2, 2, 'is', 40), |
| (3, 2, 'the', 40), |
| (4, 2, 'first', 40), |
| (5, 2, 'year', 40), |
| (6, 2, 'since', 40), |
| (7, 2, 'transportation', 40), |
| (8, 2, 'deregulation', 40), |
| (9, 2, 'in', 40), |
| (10, 2, '1980', 40), |
| (11, 2, 'that', 40), |
| (12, 2, 'we', 40), |
| (13, 2, 'have', 40), |
| (14, 2, 'had', 40), |
| (15, 2, 'such', 40), |
| (16, 2, 'a', 40), |
| (17, 2, 'dramatic', 40), |
| (18, 2, 'and', 40), |
| (19, 2, 'broad-based', 40), |
| (20, 2, 'upturn', 40), |
| (21, 2, 'in', 40), |
| (22, 2, 'perceived', 40), |
| (23, 2, 'transportation', 40), |
| (24, 2, 'rates', 40), |
| (25, 2, ',', 40), |
| (26, 2, '''''', 40), |
| (27, 2, 'said', 40), |
| (28, 2, 'bernard', 40), |
| (29, 2, 'lalonde', 40), |
| (30, 2, ',', 40), |
| (31, 2, 'a', 40), |
| (32, 2, 'transportation', 40), |
| (33, 2, 'logistics', 40), |
| (34, 2, 'professor', 40), |
| (35, 2, 'at', 40), |
| (36, 2, 'ohio', 40), |
| (37, 2, 'state', 40), |
| (38, 2, 'in', 40), |
| (39, 2, 'columbus', 40), |
| (40, 2, '.', 40), |
| (0, 3, 'carriers', 16), |
| (1, 3, 'could', 16), |
| (2, 3, 'use', 16), |
| (3, 3, 'their', 16), |
| (4, 3, 'equipment', 16), |
| (5, 3, 'more', 16), |
| (6, 3, 'efficiently', 16), |
| (7, 3, ',', 16), |
| (8, 3, 'leading', 16), |
| (9, 3, 'to', 16), |
| (10, 3, 'overcapacity', 16), |
| (11, 3, 'they', 16), |
| (12, 3, 'were', 16), |
| (13, 3, 'eager', 16), |
| (14, 3, 'to', 16), |
| (15, 3, 'fill', 16), |
| (16, 3, '.', 16), |
| (0, 4, 'the', 17), |
| (1, 4, 'deregulation', 17), |
| (2, 4, 'of', 17), |
| (3, 4, 'railroads', 17), |
| (4, 4, 'and', 17), |
| (5, 4, 'trucking', 17), |
| (6, 4, 'companies', 17), |
| (7, 4, 'that', 17), |
| (8, 4, 'began', 17), |
| (9, 4, 'in', 17), |
| (10, 4, '1980', 17), |
| (11, 4, 'enabled', 17), |
| (12, 4, 'shippers', 17), |
| (13, 4, 'to', 17), |
| (14, 4, 'bargain', 17), |
| (15, 4, 'for', 17), |
| (16, 4, 'transportation', 17), |
| (17, 4, '.', 17); |
| -- Collect planner statistics for the freshly loaded segment table. |
| ANALYZE test_segmenttbl_new; |
| |
| -- Extract features for the tokens stored in the segment table. |
| -- NOTE(review): the two viterbi_* tables are not created in this script, so they are |
| -- presumably produced by this call -- confirm against the crf_test_fgen documentation. |
| SELECT crf_test_fgen( |
|     'test_segmenttbl_new',   -- segment table loaded above |
|     'crf_dictionary_new',    -- dictionary table loaded above |
|     'crf_label_new',         -- label table loaded above |
|     'crf_regex_new',         -- regex table loaded above |
|     'crf_feature_test_new',  -- feature table loaded above |
|     'viterbi_mtbl_new', |
|     'viterbi_rtbl_new'); |
| |
| -- Label the segments using the two viterbi_* tables named in the call above. |
| -- NOTE(review): extraction_new is not created in this script and is read by the final |
| -- assertion, so it is presumably the output table of this call -- confirm. |
| SELECT vcrf_label( |
|     'test_segmenttbl_new', |
|     'viterbi_mtbl_new', |
|     'viterbi_rtbl_new', |
|     'crf_label_new', |
|     'extraction_new'); |
| |
| -- Expected viterbi labeling result |
| -- The result is produced from Dr. Sunita's CRF java package with the same input |
| -- One row per token: the document id, the token position, the token text, and the label text |
| -- the test expects for it. |
| CREATE TABLE expected_extraction_new(doc_id integer, start_pos integer, seg_text text, label character varying); |
| -- Explicit column list: rows stay bound to the intended columns even if the DDL changes. |
| INSERT INTO expected_extraction_new (doc_id, start_pos, seg_text, label) VALUES |
| (1, 0, 'that', 'DT'), |
| (1, 1, '''s', 'VBZ'), |
| (1, 2, 'a', 'DT'), |
| (1, 3, 'big', 'JJ'), |
| (1, 4, 'change', 'NN'), |
| (1, 5, 'from', 'IN'), |
| (1, 6, 'recent', 'JJ'), |
| (1, 7, 'years', 'NNS'), |
| (1, 8, 'when', 'WRB'), |
| (1, 9, 'freight', 'NN'), |
| (1, 10, 'haulage', 'NN'), |
| (1, 11, 'was', 'VBD'), |
| (1, 12, 'a', 'DT'), |
| (1, 13, 'bright', 'JJ'), |
| (1, 14, 'spot', 'NN'), |
| (1, 15, 'for', 'IN'), |
| (1, 16, 'u.s.', 'NNP'), |
| (1, 17, 'productivity', 'NN'), |
| (1, 18, ',', ','), |
| (1, 19, 'helping', 'VBG'), |
| (1, 20, 'to', 'TO'), |
| (1, 21, 'restrain', 'VB'), |
| (1, 22, 'inflation', 'NN'), |
| (1, 23, 'and', 'CC'), |
| (1, 24, 'make', 'VB'), |
| (1, 25, 'u.s.', 'NNP'), |
| (1, 26, 'industry', 'NN'), |
| (1, 27, 'more', 'RBR'), |
| (1, 28, 'competitive', 'JJ'), |
| (1, 29, 'abroad', 'RB'), |
| (1, 30, '.', '.'); |
| -- The first token of document 2 and its expected label are both a pair of backquote characters, |
| -- which m4 could treat as quote delimiters. The delimiters are switched around that single row |
| -- so the literals reach SQL unchanged, and are set to empty strings afterwards. |
| m4_changequote(<!,!>) |
| INSERT INTO expected_extraction_new (doc_id, start_pos, seg_text, label) VALUES |
| (2, 0, <!'``'!>, <!'``'!>); |
| m4_changequote(,) |
| INSERT INTO expected_extraction_new (doc_id, start_pos, seg_text, label) VALUES |
| (2, 1, 'this', 'DT'), |
| (2, 2, 'is', 'VBZ'), |
| (2, 3, 'the', 'DT'), |
| (2, 4, 'first', 'JJ'), |
| (2, 5, 'year', 'NN'), |
| (2, 6, 'since', 'IN'), |
| (2, 7, 'transportation', 'NN'), |
| (2, 8, 'deregulation', 'NN'), |
| (2, 9, 'in', 'IN'), |
| (2, 10, '1980', 'CD'), |
| (2, 11, 'that', 'DT'), |
| (2, 12, 'we', 'JJ'), |
| (2, 13, 'have', 'JJ'), |
| (2, 14, 'had', 'NN'), |
| (2, 15, 'such', 'IN'), |
| (2, 16, 'a', 'DT'), |
| (2, 17, 'dramatic', 'NN'), |
| (2, 18, 'and', 'CC'), |
| (2, 19, 'broad-based', 'VBN'), |
| (2, 20, 'upturn', 'RB'), |
| (2, 21, 'in', 'IN'), |
| (2, 22, 'perceived', 'VBN'), |
| (2, 23, 'transportation', 'NN'), |
| (2, 24, 'rates', 'NNS'), |
| (2, 25, ',', ','), |
| (2, 26, '''''', ''''''), |
| (2, 27, 'said', 'VBD'), |
| (2, 28, 'bernard', 'NNP'), |
| (2, 29, 'lalonde', 'NNP'), |
| (2, 30, ',', ','), |
| (2, 31, 'a', 'DT'), |
| (2, 32, 'transportation', 'NN'), |
| (2, 33, 'logistics', 'NNS'), |
| (2, 34, 'professor', 'IN'), |
| (2, 35, 'at', 'IN'), |
| (2, 36, 'ohio', 'NNP'), |
| (2, 37, 'state', 'NNP'), |
| (2, 38, 'in', 'NNP'), |
| (2, 39, 'columbus', 'NNP'), |
| (2, 40, '.', '.'), |
| (3, 0, 'carriers', 'NNS'), |
| (3, 1, 'could', 'VBP'), |
| (3, 2, 'use', 'VBN'), |
| (3, 3, 'their', 'PRP$'), |
| (3, 4, 'equipment', 'JJ'), |
| (3, 5, 'more', 'RBR'), |
| (3, 6, 'efficiently', 'JJ'), |
| (3, 7, ',', ','), |
| (3, 8, 'leading', 'VBG'), |
| (3, 9, 'to', 'TO'), |
| (3, 10, 'overcapacity', 'VB'), |
| (3, 11, 'they', 'PRP'), |
| (3, 12, 'were', 'VBP'), |
| (3, 13, 'eager', 'VBG'), |
| (3, 14, 'to', 'TO'), |
| (3, 15, 'fill', 'VB'), |
| (3, 16, '.', '.'), |
| (4, 0, 'the', 'DT'), |
| (4, 1, 'deregulation', 'NN'), |
| (4, 2, 'of', 'IN'), |
| (4, 3, 'railroads', 'NN'), |
| (4, 4, 'and', 'CC'), |
| (4, 5, 'trucking', 'NNP'), |
| (4, 6, 'companies', 'VBD'), |
| (4, 7, 'that', 'DT'), |
| (4, 8, 'began', 'NN'), |
| (4, 9, 'in', 'IN'), |
| (4, 10, '1980', 'CD'), |
| (4, 11, 'enabled', 'VBN'), |
| (4, 12, 'shippers', 'NNS'), |
| (4, 13, 'to', 'TO'), |
| (4, 14, 'bargain', 'VB'), |
| (4, 15, 'for', 'IN'), |
| (4, 16, 'transportation', 'NN'), |
| (4, 17, '.', '.'); |
| |
| |
| -- Compare the expected labeling with the extraction result as multisets. |
| -- EXCEPT ALL is evaluated in both directions, so the assertion passes only when |
| -- neither table holds a row (or a duplicate of a row) that the other one lacks. |
| SELECT assert( |
|     missing.cnt + unexpected.cnt = 0, |
|     'Labels predicted do not match expected labels.') |
| FROM ( |
|     -- rows that are expected but absent from extraction_new |
|     SELECT count(*) AS cnt |
|     FROM ( |
|         SELECT doc_id, start_pos, seg_text, label |
|         FROM expected_extraction_new |
|         EXCEPT ALL |
|         SELECT doc_id, start_pos, seg_text, label |
|         FROM extraction_new |
|     ) AS expected_only |
| ) AS missing |
| CROSS JOIN ( |
|     -- rows that are present in extraction_new but not expected |
|     SELECT count(*) AS cnt |
|     FROM ( |
|         SELECT doc_id, start_pos, seg_text, label |
|         FROM extraction_new |
|         EXCEPT ALL |
|         SELECT doc_id, start_pos, seg_text, label |
|         FROM expected_extraction_new |
|     ) AS extracted_only |
| ) AS unexpected; |