---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned up outside.
---------------------------------------------------------------------------
m4_include(`SQLCommon.m4')
---------------------------------------------------------------------------
-- Build vocabulary:
---------------------------------------------------------------------------
CREATE TABLE lda_vocab(wordid INT4, word TEXT);
INSERT INTO lda_vocab VALUES
(0, 'code'), (1, 'data'), (2, 'graph'), (3, 'image'), (4, 'input'),
(5, 'layer'), (6, 'learner'), (7, 'loss'), (8, 'model'), (9, 'network'),
(10, 'neuron'), (11, 'object'), (12, 'output'), (13, 'rate'), (14, 'set'),
(15, 'signal'), (16, 'sparse'), (17, 'spatial'), (18, 'system'), (19, 'training');
---------------------------------------------------------------------------
-- Build training dataset:
---------------------------------------------------------------------------
CREATE TABLE lda_training
(
docid INT4,
wordid INT4,
count INT4
);
INSERT INTO lda_training VALUES
(0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13, 1),
(1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1, 14, 1),(1, 17, 1),(1, 18, 1),
(2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13, 1),(2, 15, 1),(2, 18, 2),
(3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17, 2),(3, 19, 1),
(4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4, 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1),
(5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7, 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2),
(6, 1, 1),(6, 3, 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1),
(7, 0, 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7, 16, 1),
(8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1),
(9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9, 19, 1);
CREATE TABLE lda_training_odd_voc_size
(
docid INT4,
wordid INT4,
count INT4
);
INSERT INTO lda_training_odd_voc_size VALUES
(0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13, 1),
(1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1, 14, 1),(1, 17, 1),(1, 18, 1),
(2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13, 1),(2, 15, 1),(2, 18, 2),
(3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17, 2),(3, 19, 1),
(4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4, 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1),
(5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7, 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2),
(6, 1, 1),(6, 3, 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1),
(7, 0, 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7, 16, 1),
(8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1),
(9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9, 19, 1),(9, 20, 1);
CREATE TABLE lda_testing
(
docid INT4,
wordid INT4,
count INT4
);
INSERT INTO lda_testing VALUES
(0, 0, 2),(0, 8, 1),(0, 9, 1),(0, 10, 1),(0, 12, 1),(0, 15, 2),(0, 18, 1),(0, 19, 1),
(1, 0, 1),(1, 2, 1),(1, 5, 1),(1, 7, 1),(1, 12, 2),(1, 13, 1),(1, 16, 1),(1, 17, 1),(1, 18, 1),
(2, 0, 1),(2, 1, 1),(2, 2, 1),(2, 3, 1),(2, 4, 1),(2, 5, 1),(2, 6, 1),(2, 12, 1),(2, 14, 1),(2, 18, 1),
(3, 2, 2),(3, 6, 2),(3, 7, 1),(3, 9, 1),(3, 11, 2),(3, 14, 1),(3, 15, 1),
(4, 1, 1),(4, 2, 2),(4, 3, 1),(4, 5, 2),(4, 6, 1),(4, 11, 1),(4, 18, 2);
---------------------------------------------------------------------------
-- Test
---------------------------------------------------------------------------
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
2, -- iter_num
10, -- alpha
0.01); -- beta
SELECT lda_parse_model(model, voc_size, topic_num) AS parsed_model
FROM lda_model;
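-- Sanity check on the model summary; voc_size and topic_num are taken verbatim from the
-- lda_train call above.
SELECT assert(voc_size = 20 AND topic_num = 5,
'lda_model should record voc_size = 20 and topic_num = 5')
FROM lda_model;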
SELECT
assert(
topic_num IS NOT NULL AND topic_num = 5,
'topic_num should be 5!'
),
assert(
sum_of_topic_counts IS NOT NULL AND sum_of_word_counts IS NOT NULL
AND sum_of_topic_counts = sum_of_word_counts,
'sum_of_topic_counts (' || sum_of_topic_counts::text || ') != sum_of_word_counts (' || sum_of_word_counts::text || ')!'
)
FROM
(
SELECT
array_upper((parsed_model).total_topic_counts, 1) AS topic_num,
array_sum((parsed_model).total_topic_counts) AS sum_of_topic_counts
FROM
(
SELECT lda_parse_model(model, voc_size, topic_num) AS parsed_model
FROM lda_model
) subq_parsed_model
) subq_topic_num,
(
SELECT sum(count) AS sum_of_word_counts FROM lda_training
) subq_word_count;
SELECT lda_predict(
'lda_testing',
'lda_model',
'lda_pred');
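-- Illustrative check (assumption: the prediction output table keeps the docid column of
-- lda_testing, which holds 5 documents).
SELECT assert(count(DISTINCT docid) = 5, 'lda_pred should cover all 5 test documents')
FROM lda_pred;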
SELECT lda_predict(
'lda_testing',
'lda_model',
'lda_pred_2',
5);
SELECT lda_get_perplexity(
'lda_model',
'lda_pred');
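-- Illustrative check (assumption: lda_get_perplexity returns a single, strictly positive value).
SELECT assert(lda_get_perplexity('lda_model', 'lda_pred') > 0,
'Perplexity of lda_pred should be positive');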
SELECT __lda_util_index_sort(array[1, 4, 2, 3]);
SELECT __lda_util_transpose(array[[1, 2, 3],[4, 5, 6]]);
SELECT assert(count(*) = 2, 'Wrong answer: __lda_util_unnest_transpose()')
FROM
(
SELECT __lda_util_unnest_transpose(array[1, 2, 0, 4, 5, 0]::bigint[], 3, 2)
) subq;
SELECT assert(count(*) = 3, 'Wrong answer: __lda_util_unnest()')
FROM
(
SELECT __lda_util_unnest(array[1, 2, 0, 4, 5, 0]::bigint[], 3, 2)
) subq;
SELECT __lda_util_norm_with_smoothing(array[1, 4, 2, 3], 0.1);
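-- Illustrative check (assumptions: the helper returns a smoothed probability vector and
-- array_sum() accepts its return type); the normalized values should sum to approximately 1.
SELECT assert(
abs(array_sum(__lda_util_norm_with_smoothing(array[1, 4, 2, 3], 0.1)) - 1.0) < 1e-6,
'__lda_util_norm_with_smoothing should return a normalized vector');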
-- no words hit ceiling
SELECT
assert(
__lda_check_count_ceiling(
array[0, 0, 0, 0]::bigint[],
2,
2)
IS NULL,
'__lda_check_count_ceiling should return NULL for [0, 0, 0, 0]');
-- all words hit the ceiling (negative counts), so a non-NULL result is expected
SELECT
assert(
__lda_check_count_ceiling(
array[-1, -1, -1, -1]::bigint[],
2,
2)
IS NOT NULL,
'__lda_check_count_ceiling should not return NULL for [-1, -1, -1, -1]');
SELECT lda_get_topic_desc(
'lda_model',
'lda_vocab',
'topic_word_desc',
5);
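-- Illustrative sanity check (assumes the call above materializes the topic_word_desc table).
SELECT assert(count(*) > 0, 'topic_word_desc should not be empty') FROM topic_word_desc;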
SELECT lda_get_topic_word_count(
'lda_model',
'topic_word_count');
SELECT lda_get_word_topic_count(
'lda_model',
'topic_word_count_2');
SELECT lda_get_word_topic_mapping(
'lda_output_data',
'word_topic_mapping');
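-- Illustrative sanity check (assumes the call above materializes the word_topic_mapping table).
SELECT assert(count(*) > 0, 'word_topic_mapping should not be empty') FROM word_topic_mapping;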
SELECT *
FROM __lda_util_norm_vocab('lda_vocab', 'norm_lda_vocab');
SELECT *
FROM __lda_util_norm_dataset('lda_testing', 'norm_lda_vocab',
'norm_lda_data');
SELECT *
FROM __lda_util_conorm_data('lda_testing', 'lda_vocab',
'norm_lda_data_2', 'norm_lda_vocab_2');
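-- Illustrative check (assumption: co-normalization keeps one row per retained word, so the
-- co-normalized vocabulary can never be larger than the original one).
SELECT assert(
(SELECT count(*) FROM norm_lda_vocab_2) <= (SELECT count(*) FROM lda_vocab),
'co-normalized vocabulary should not exceed the original vocabulary');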
-- Cover odd/even combinations of voc_size and topic_num
SELECT lda_train(
'lda_training_odd_voc_size',
'lda_model_odd_voc_size_even_topic_num',
'lda_output_odd_voc_size_even_topic_num',
21, 6, 2, 3, 0.01);
SELECT lda_train(
'lda_training_odd_voc_size',
'lda_model_odd_voc_size_odd_topic_num',
'lda_output_odd_voc_size_odd_topic_num',
21, 5, 2, 3, 0.01);
SELECT lda_train(
'lda_training',
'lda_model_even_voc_size_even_topic_num',
'lda_output_even_voc_size_even_topic_num',
20, 6, 2, 3, 0.01);
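-- Illustrative check (assumption: lda_train writes a single summary row per model table).
SELECT assert(
(SELECT count(*) FROM lda_model_odd_voc_size_even_topic_num) = 1 AND
(SELECT count(*) FROM lda_model_odd_voc_size_odd_topic_num) = 1 AND
(SELECT count(*) FROM lda_model_even_voc_size_even_topic_num) = 1,
'each lda_train call above should produce a one-row model table');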
DROP TABLE IF EXISTS documents;
CREATE TABLE documents(docid INT4, contents TEXT);
INSERT INTO documents VALUES
(0, 'b a a c'),
(1, 'd e f f f');
ALTER TABLE documents ADD COLUMN words TEXT[];
UPDATE documents SET words = regexp_split_to_array(lower(contents), E'[\\s+\\.\\,]');
DROP TABLE IF EXISTS my_training, my_training_vocabulary;
SELECT term_frequency('documents', 'docid', 'words', 'my_training', TRUE);
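-- Illustrative check (assumptions: term_frequency(..., TRUE) materializes my_training_vocabulary
-- with one row per distinct word; the two documents contain the 6 distinct words a-f).
SELECT assert(count(*) = 6, 'my_training_vocabulary should contain 6 words')
FROM my_training_vocabulary;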
DROP TABLE IF EXISTS my_model, my_outdata;
SELECT lda_train(
'my_training',
'my_model',
'my_outdata',
6, 2, 2, 3, 0.01);
DROP TABLE IF EXISTS word_topic_count;
SELECT lda_get_word_topic_count( 'my_model', 'word_topic_count');
SELECT * FROM word_topic_count ORDER BY wordid;
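-- Illustrative check (assumption: word_topic_count carries one wordid per vocabulary entry,
-- and the vocabulary built above has 6 words).
SELECT assert(count(DISTINCT wordid) = 6, 'word_topic_count should cover all 6 words')
FROM word_topic_count;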
-- This function will validate that the output of lda_get_word_topic_count() is consistent with the output of lda_train
CREATE OR REPLACE FUNCTION validate_lda_output() RETURNS integer AS $$
DECLARE
-- variables for looping through the word_topic_mapping table
word_topic_mapping_row RECORD;
word_topic_mapping_wordid int;
word_topic_mapping_topicid int;
-- we use the following array from word_topic_mapping to compare with the output of lda_get_word_topic_count()
word_topic_mapping_array INT[6][2] := ARRAY[[0,0],[0,0],[0,0],[0,0],[0,0],[0,0]];
word_topic_mapping_topic_count int[];
-- variables for looping through the output of lda_get_word_topic_count()
word_topic_count_row RECORD;
word_topic_count_topic_count int[];
BEGIN
-- Create helper table from out_data
DROP TABLE IF EXISTS word_topic_mapping;
CREATE TABLE word_topic_mapping AS
SELECT wordid, topicid
FROM (
SELECT unnest(svec_from_string('{' || array_to_string(counts, ',') || '}:{' || array_to_string(words, ',') || '}')::float[]) AS wordid,
unnest(topic_assignment) AS topicid
FROM my_outdata
) a
ORDER BY wordid;
-- Construct the mapping array
FOR word_topic_mapping_row IN SELECT * FROM word_topic_mapping ORDER BY wordid LOOP
-- we need to add 1 because Postgres arrays start from 1 while our wordid and topicid start from 0
word_topic_mapping_wordid := word_topic_mapping_row.wordid+1;
word_topic_mapping_topicid := word_topic_mapping_row.topicid+1;
word_topic_mapping_array[word_topic_mapping_wordid][word_topic_mapping_topicid] := word_topic_mapping_array[word_topic_mapping_wordid][word_topic_mapping_topicid] + 1;
END LOOP;
-- Loop through the output of lda_get_word_topic_count() and compare the results with the helper table word_topic_mapping
FOR word_topic_count_row IN select * from word_topic_count ORDER BY wordid LOOP
word_topic_count_topic_count := ARRAY[word_topic_count_row.topic_count];
word_topic_mapping_topic_count := word_topic_mapping_array[word_topic_count_row.wordid+1:word_topic_count_row.wordid+1]; -- Here arrayX[i:i] means the ith 1d array of a 2d array
IF (word_topic_mapping_topic_count != word_topic_count_topic_count) THEN
RAISE EXCEPTION 'Topic assignment for wordid % does not match. word_topic_mapping: % word_topic_count: %',word_topic_count_row.wordid, word_topic_mapping_topic_count, word_topic_count_topic_count;
END IF;
END LOOP;
RETURN 1;
END;
$$ LANGUAGE plpgsql;
select validate_lda_output();
---------- TEST CASES FOR PERPLEXITY ----------
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
2, -- iter_num
10, -- alpha
0.01, -- beta
2, -- evaluate_every
.2); -- perplexity_tol
SELECT assert(perplexity_iters = '{2}', 'Number of perplexity iterations is wrong') FROM lda_model;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
3, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
.1 -- perplexity_tol
);
SELECT assert(array_upper(perplexity,1) = 3, 'Perplexity calculation is wrong') FROM lda_model;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Function to check that the perplexity value returned by lda_get_perplexity()
-- and the one calculated by lda_train() are the same.
CREATE OR REPLACE FUNCTION validate_perplexity() RETURNS boolean AS $$
DECLARE
perplexity_from_func Double precision[];
perplexity_lda_train Double precision[];
BEGIN
drop table if exists lda_model, lda_output_data;
PERFORM lda_train(
'lda_training',
'lda_model',
'lda_output_data',
20, 5, 2, 10, 0.01, 2, .2);
SELECT array_agg(round(lda_get_perplexity::numeric,10)) INTO perplexity_from_func from lda_get_perplexity('lda_model','lda_output_data');
select perplexity INTO perplexity_lda_train from lda_model ;
if perplexity_lda_train != perplexity_from_func THEN
return FALSE;
ELSE
return TRUE;
END IF;
END;
$$ LANGUAGE plpgsql;
SELECT assert(validate_perplexity() = TRUE, 'Perplexity calculation is wrong');
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Test for evaluate_every = Number of iterations = 1. It should give exactly one perplexity value --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
1, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
.1 -- perplexity_tol
);
select assert(perplexity != '{}', 'Perplexity should be calculated') from lda_model;
select assert(array_upper(perplexity,1) = 1, 'Perplexity should not have more than 1 value') from lda_model;
-- Test for evaluate_every = 0: perplexity should not be calculated --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
1, -- iter_num
10, -- alpha
0.01, -- beta
0, -- evaluate_every
.1 -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;
-- Test for evaluate_every = -1: perplexity should not be calculated --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
1, -- iter_num
10, -- alpha
0.01, -- beta
-1, -- evaluate_every
.1 -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;
-- Test to check if the perplexity_iters are matching the expected value --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
2, -- evaluate_every
.1 -- perplexity_tol
);
SELECT assert(array_upper(perplexity_iters,1) <= 5, 'Perplexity iterations are different from expected') FROM lda_model ;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Test: if the difference in perplexity between two consecutive evaluations is less than --
-- perplexity_tol, training stops early. Here perplexity_tol is very large (100), so the --
-- difference between the perplexity values computed at iterations 2 and 4 falls below the --
-- tolerance, and only 2 perplexity values are recorded. --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
2, -- evaluate_every
100 -- perplexity_tol
);
SELECT assert(abs(perplexity[2] - perplexity[1]) < 100, 'Perplexity difference should be less than perplexity_tol') FROM lda_model;
SELECT assert(array_upper(perplexity_iters,1) = 2, 'Perplexity iterations are different from expected') FROM lda_model ;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Test for evaluate_every = 1 and perplexity_tol = 0: the iterations should not stop early --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
0 -- perplexity_tol
);
select assert(num_iterations = 10, 'Training should run for all 10 iterations') from lda_model;
-- Test for evaluate_every = NULL and perplexity_tol = NULL. In this case it should not calculate perplexity --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
NULL, -- evaluate_every
NULL -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;
-- Test for evaluate_every = 1 and perplexity_tol = NULL. --
-- In this case it should calculate perplexity using the default perplexity_tol of 0.1 --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
NULL -- perplexity_tol
);
select assert(array_upper(perplexity_iters,1) >= 1, 'Perplexity iterations are different from expected') from lda_model ;
select assert(perplexity != '{}', 'Perplexity should be calculated') from lda_model;
-- Test for evaluate_every = NULL and perplexity_tol != NULL --
-- In this case it should not calculate perplexity --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
NULL, -- evaluate_every
1 -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;