blob: ec49d4b9ac01957486e2516340782dd2dfc9ad5e [file] [log] [blame]
---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned-up outside.
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-- Setup:
---------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION fill_feature(pre_class INT, p FLOAT, total INT)
RETURNS FLOAT AS $$
declare
result FLOAT;
thres INT;
begin
thres = total*p;
IF pre_class <= thres THEN
result = 1;
ELSE
result = 0;
END IF;
RETURN result;
end
$$ language plpgsql;
CREATE OR REPLACE FUNCTION fill_feature3(pre_class INT, p1 FLOAT, p2 FLOAT, total INT)
RETURNS FLOAT AS $$
declare
result FLOAT;
thres1 INT;
thres2 INT;
begin
thres1 = total*p1;
thres2 = total*(p2+p1);
IF pre_class <= thres1 THEN
result = 2;
ELSIF pre_class <= thres2 THEN
result = 1;
ELSE
result = 0;
END IF;
RETURN result;
end
$$ language plpgsql;
-- ----------------
-- install_test_1()
-- ----------------
CREATE FUNCTION install_test_1() RETURNS VOID AS $$
declare
num1 INT := 10;
num2 INT := 10;
result1 INT;
count1 INT;
tempvar INT[];
begin
-- prepare training data: equal priors
--DROP TABLE IF EXISTS data CASCADE;
CREATE TABLE data_1( class INT, attrib FLOAT[] );
INSERT INTO data_1 SELECT 1, ARRAY[fill_feature(id,0.3,num1),fill_feature(id,0.8,num1)] FROM generate_series(1,num1) as id;
INSERT INTO data_1 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2)] FROM generate_series(1,num2) as id;
-- prepare testing data
--DROP TABLE IF EXISTS data_test CASCADE;
CREATE TABLE data_test_1( id INT, attrib INT[], prob FLOAT[] );
INSERT INTO data_test_1 VALUES (1,'{0,0}','{0.4,0.6}');
INSERT INTO data_test_1 VALUES (2,'{0,1}','{0.66666,0.33333}');
INSERT INTO data_test_1 VALUES (3,'{1,0}','{0.25,0.75}');
INSERT INTO data_test_1 VALUES (4,'{1,1}','{0.5,0.5}');
-- prepare true results for testing data
--DROP TABLE IF EXISTS data_results;
CREATE TABLE data_results_1 AS
SELECT
id,
unnest('{1,2}'::INT[]) as class,
unnest(prob) as prob
FROM data_test_1;
-- Process training
--DROP TABLE IF EXISTS probs CASCADE;
--DROP TABLE IF EXISTS priors CASCADE;
PERFORM MADLIB_SCHEMA.create_nb_prepared_data_tables('data_1','class','attrib',2,'probs','priors');
-- Classify
--DROP VIEW IF EXISTS results;
PERFORM MADLIB_SCHEMA.create_nb_classify_view('probs','priors','data_test_1','id','attrib',2,'results_1');
-- Compute posteriors
--DROP VIEW IF EXISTS probs_view;
PERFORM MADLIB_SCHEMA.create_nb_probs_view('probs','priors','data_test_1','id','attrib',2,'probs_view_1');
-- Check the results
SELECT * INTO result1,count1 FROM
(SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*)
FROM data_results_1 data_results INNER JOIN probs_view_1 as probs_view
ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t;
IF (result1 != 0) OR (count1 != 8) THEN
RAISE EXCEPTION 'Incorrect probabilities';
END IF;
-- Check classification: case {1,1} should have equal posteriors, so should return 2 classes
SELECT * INTO tempvar
FROM (SELECT nb_classification FROM results_1 WHERE key=4) as a;
SELECT count(*) INTO result1
FROM (SELECT unnest(tempvar)) as b;
IF (result1 != 2) THEN
RAISE EXCEPTION 'Incorrect classification';
END IF;
-- Repeat using function w/out preprocessing priors
-- Classify
--DROP VIEW IF EXISTS results;
--PERFORM MADLIB_SCHEMA.create_nb_classify_view('data','class','attrib','data_test','id','attrib',2,'results');
-- Compute posteriors
--DROP VIEW IF EXISTS probs_view;
--PERFORM MADLIB_SCHEMA.create_nb_probs_view('data','class','attrib','data_test','id','attrib',2,'probs_view');
end
$$ language plpgsql;
-- ----------------
-- install_test_2()
-- ----------------
CREATE FUNCTION install_test_2() RETURNS VOID AS $$
declare
num1 INT := 10;
num2 INT := 10;
result1 INT;
count1 INT;
tempvar INT[];
begin
-- 3 class case
CREATE TABLE data_2( class INT, attrib FLOAT[] );
INSERT INTO data_2 SELECT 1, ARRAY[fill_feature(id,0.3,num1),fill_feature(id,0.8,num1)] FROM generate_series(1,num1) as id;
INSERT INTO data_2 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2)] FROM generate_series(1,num2) as id;
INSERT INTO data_2 SELECT 3, ARRAY[fill_feature3(id,0.3,0.5,num1),fill_feature(id,0.6,num1)] FROM generate_series(1,num1) as id;
--DROP TABLE IF EXISTS data_test CASCADE;
CREATE TABLE data_test_2( id INT, attrib INT[], prob FLOAT[] );
INSERT INTO data_test_2 VALUES (1,'{0,0}','{0.32,0.48,0.2}');
INSERT INTO data_test_2 VALUES (2,'{0,1}','{0.5581,0.279,0.1627}');
INSERT INTO data_test_2 VALUES (3,'{1,0}','{0.1538,0.4615,0.3846}');
INSERT INTO data_test_2 VALUES (4,'{1,1}','{0.3157,0.3157,0.3684}');
INSERT INTO data_test_2 VALUES (5,'{2,0}','{0.1034,0.2068,0.6897}');
INSERT INTO data_test_2 VALUES (6,'{2,1}','{0.2093,0.1395,0.6511}');
--DROP TABLE IF EXISTS data_results;
CREATE TABLE data_results_2 AS
SELECT
id,
unnest('{1,2,3}'::INT[]) as class,
unnest(prob) as prob
FROM data_test_2;
--DROP TABLE IF EXISTS probs CASCADE;
--DROP TABLE IF EXISTS priors CASCADE;
PERFORM create_nb_prepared_data_tables('data_2','class','attrib',2,'probs_2','priors_2');
--DROP VIEW IF EXISTS results;
PERFORM create_nb_classify_view('probs_2','priors_2','data_test_2','id','attrib',2,'results_2');
--DROP VIEW IF EXISTS probs_view;
PERFORM create_nb_probs_view('probs_2','priors_2','data_test_2','id','attrib',2,'probs_view_2');
SELECT * INTO result1,count1 FROM
(SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*)
FROM data_results_2 as data_results INNER JOIN probs_view_2 as probs_view
ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t;
IF (result1 != 0) OR (count1 != 18) THEN
RAISE EXCEPTION 'Incorrect probabilities for 3-class, got %',result1;
END IF;
RAISE INFO 'Naive Bayes install checks passed';
RETURN;
end
$$ language plpgsql;
-- ----------------
-- install_test_3()
-- ----------------
CREATE FUNCTION install_test_3() RETURNS VOID AS $$
declare
num1 INT := 10;
num2 INT := 10;
result1 INT;
count1 INT;
tempvar INT[];
begin
-- Unequal prior probabilities
--DROP TABLE IF EXISTS data CASCADE;
CREATE TABLE data_3( class INT, attrib FLOAT[] );
INSERT INTO data_3 SELECT 1, ARRAY[fill_feature(id,0.3,num1*2),fill_feature(id,0.8,num1*2)] FROM generate_series(1,num1*2) as id;
INSERT INTO data_3 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2)] FROM generate_series(1,num2) as id;
-- prepare testing data
--DROP TABLE IF EXISTS data_test CASCADE;
CREATE TABLE data_test_3( id INT, attrib INT[], prob FLOAT[] );
INSERT INTO data_test_3 VALUES (1,'{0,0}','{0.5535,0.4464}');
INSERT INTO data_test_3 VALUES (2,'{0,1}','{0.8082,0.1917}');
INSERT INTO data_test_3 VALUES (3,'{1,0}','{0.3664,0.6335}');
INSERT INTO data_test_3 VALUES (4,'{1,1}','{0.6629,0.337}');
-- prepare true results for testing data
--DROP TABLE IF EXISTS data_results;
CREATE TABLE data_results_3 AS
SELECT
id,
unnest('{1,2}'::INT[]) as class,
unnest(prob) as prob
FROM data_test_3;
-- Process training
--DROP TABLE IF EXISTS probs CASCADE;
--DROP TABLE IF EXISTS priors CASCADE;
PERFORM MADLIB_SCHEMA.create_nb_prepared_data_tables('data_3','class','attrib',2,'probs_3','priors_3');
-- Classify
--DROP VIEW IF EXISTS results;
PERFORM MADLIB_SCHEMA.create_nb_classify_view('probs_3','priors_3','data_test_3','id','attrib',2,'results_3');
-- Compute posteriors
--DROP VIEW IF EXISTS probs_view;
PERFORM MADLIB_SCHEMA.create_nb_probs_view('probs_3','priors_3','data_test_3','id','attrib',2,'probs_view_3');
-- Check the results
SELECT * INTO result1,count1 FROM
(SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*)
FROM data_results_3 as data_results INNER JOIN probs_view_3 as probs_view
ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t;
IF (result1 != 0) OR (count1 != 8) THEN
RAISE EXCEPTION 'Incorrect probabilities';
END IF;
RAISE INFO 'Naive Bayes install checks passed';
RETURN;
end
$$ language plpgsql;
-----------------------------------------------------------
--Test where numeric probabilities cancel out, leaving
--only nominal probabilities to decide the class
-----------------------------------------------------------
CREATE FUNCTION install_test_4() RETURNS VOID AS $$
declare
num1 INT := 10;
num2 INT := 10;
result1 INT;
count1 INT;
tempvar INT[];
begin
--equal class priors and numerical attr probabilities
--DROP TABLE IF EXISTS data CASCADE;
CREATE TABLE data_4( class INT, attrib FLOAT[] );
INSERT INTO data_4 SELECT 1, ARRAY[fill_feature(id,0.3,num1),fill_feature(id,0.8,num1),id] FROM generate_series(1,num1) as id;
INSERT INTO data_4 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.5,num2),id] FROM generate_series(1,num2) as id;
-- prepare testing data
--DROP TABLE IF EXISTS data_test CASCADE;
CREATE TABLE data_test_4( id INT, attrib NUMERIC[], prob FLOAT[] );
INSERT INTO data_test_4 VALUES (1,'{0,0,1.25}','{0.4,0.6}');
INSERT INTO data_test_4 VALUES (2,'{0,1,2.0}','{0.66666,0.33333}');
INSERT INTO data_test_4 VALUES (3,'{1,0,4}','{0.25,0.75}');
INSERT INTO data_test_4 VALUES (4,'{1,1,1.3}','{0.5,0.5}');
-- prepare true results for testing data
--DROP TABLE IF EXISTS data_results;
CREATE TABLE data_results_4 AS
SELECT
id,
unnest('{1,2}'::INT[]) as class,
unnest(prob) as prob
FROM data_test_4;
-- Process training
--DROP TABLE IF EXISTS probs CASCADE;
--DROP TABLE IF EXISTS priors CASCADE;
PERFORM MADLIB_SCHEMA.create_nb_prepared_data_tables('data_4','class','attrib','ARRAY[3]',3,'probs_categ_4','probs_numeric_4','priors_4');
-- Classify
--DROP VIEW IF EXISTS results;
PERFORM MADLIB_SCHEMA.create_nb_classify_view('probs_categ_4','priors_4','data_test_4','id','attrib',3,'probs_numeric_4','results_4');
-- Compute posteriors
--DROP VIEW IF EXISTS probs_view;
PERFORM MADLIB_SCHEMA.create_nb_probs_view('probs_categ_4','priors_4','data_test_4','id','attrib',3,'probs_numeric_4','probs_view_4');
-- Check the results
SELECT * INTO result1,count1 FROM
(SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*)
FROM data_results_4 as data_results INNER JOIN probs_view_4 as probs_view
ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t;
IF (result1 != 0) OR (count1 != 8) THEN
RAISE EXCEPTION 'Incorrect probabilities';
END IF;
RAISE INFO 'Naive Bayes install checks passed';
RETURN;
end
$$ language plpgsql;
---------------------------------------------------------------------------
-- Test:
---------------------------------------------------------------------------
---------------------------------------------------------------------------
-- Test where nominal probabilities cancel out,leaving only numeric
-- probabilities to decide the classes.
---------------------------------------------------------------------------
CREATE FUNCTION install_test_5() RETURNS VOID AS $$
declare
num1 INT := 10;
num2 INT := 10;
result1 INT;
count1 INT;
tempvar INT[];
begin
--equal class priors and numerical attr probabilities
--DROP TABLE IF EXISTS data CASCADE;
CREATE TABLE data_5( class INT, attrib FLOAT[] );
INSERT INTO data_5 SELECT 1, ARRAY[fill_feature(id,0.5,num1),fill_feature(id,0.8,num1),id] FROM generate_series(1,num1) as id;
INSERT INTO data_5 SELECT 2, ARRAY[fill_feature(id,0.5,num2),fill_feature(id,0.8,num2),id%5] FROM generate_series(1,num2) as id;
-- prepare testing data
--DROP TABLE IF EXISTS data_test CASCADE;
CREATE TABLE data_test_5( id INT, attrib float8[], prob FLOAT8[] );
INSERT INTO data_test_5 VALUES (1,'{0,0,3.5}','{0.3964055,0.6035944}');
INSERT INTO data_test_5 VALUES (2,'{0,1,4.5}','{0.6554700,0.3445299}');
INSERT INTO data_test_5 VALUES (3,'{1,0,2.0}','{0.2015371,0.7984628}');
INSERT INTO data_test_5 VALUES (4,'{1,1,3.2}','{0.3378078,0.6621921}');
-- prepare true results for testing data
--DROP TABLE IF EXISTS data_results;
CREATE TABLE data_results_5 AS
SELECT
id,
unnest('{1,2}'::INT[]) as class,
unnest(prob) as prob
FROM data_test_5;
-- Process training
--DROP TABLE IF EXISTS probs CASCADE;
--DROP TABLE IF EXISTS priors CASCADE;
PERFORM MADLIB_SCHEMA.create_nb_prepared_data_tables('data_5','class','attrib','ARRAY[3]',3,'probs_categ_5','probs_numeric_5','priors_5');
-- Classify
--DROP VIEW IF EXISTS results;
PERFORM MADLIB_SCHEMA.create_nb_classify_view('probs_categ_5','priors_5','data_test_5','id','attrib',3,'probs_numeric_5','results_5');
-- Compute posteriors
--DROP VIEW IF EXISTS probs_view;
PERFORM MADLIB_SCHEMA.create_nb_probs_view('probs_categ_5','priors_5','data_test_5','id','attrib',3,'probs_numeric_5','probs_view_5');
-- Check the results
SELECT * INTO result1,count1 FROM
(SELECT sum( abs( (data_results.prob*100)::INT - (probs_view.nb_prob*100)::INT) ),count(*)
FROM data_results_5 as data_results INNER JOIN probs_view_5 as probs_view
ON (data_results.id = probs_view.key AND data_results.class = probs_view.class)) AS t;
IF (result1 != 0) OR (count1 != 8) THEN
RAISE EXCEPTION 'Incorrect probabilities';
END IF;
RAISE INFO 'Naive Bayes install checks passed';
RETURN;
end
$$ language plpgsql;
---------------------------------------------------------------------------
-- Test:
---------------------------------------------------------------------------
SELECT install_test_1();
SELECT install_test_2();
SELECT install_test_3();
SELECT install_test_4();
SELECT install_test_5();