---------------------------------------------------------------------------
-- Rules:
-- ------
-- 1) Any DB objects should be created w/o schema prefix,
-- since this file is executed in a separate schema context.
-- 2) There should be no DROP statements in this script, since
-- all objects created in the default schema will be cleaned up outside.
---------------------------------------------------------------------------
m4_include(`SQLCommon.m4')
---------------------------------------------------------------------------
-- Build vocabulary:
---------------------------------------------------------------------------
CREATE TABLE lda_vocab(wordid INT4, word TEXT);
INSERT INTO lda_vocab VALUES
(0, 'code'), (1, 'data'), (2, 'graph'), (3, 'image'), (4, 'input'),
(5, 'layer'), (6, 'learner'), (7, 'loss'), (8, 'model'), (9, 'network'),
(10, 'neuron'), (11, 'object'), (12, 'output'), (13, 'rate'), (14, 'set'),
(15, 'signal'), (16, 'sparse'), (17, 'spatial'), (18, 'system'), (19, 'training');
---------------------------------------------------------------------------
-- Build training dataset:
---------------------------------------------------------------------------
CREATE TABLE lda_training
(
docid INT4,
wordid INT4,
count INT4
);
INSERT INTO lda_training VALUES
(0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13, 1),
(1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1, 14, 1),(1, 17, 1),(1, 18, 1),
(2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13, 1),(2, 15, 1),(2, 18, 2),
(3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17, 2),(3, 19, 1),
(4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4, 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1),
(5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7, 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2),
(6, 1, 1),(6, 3, 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1),
(7, 0, 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7, 16, 1),
(8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1),
(9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9, 19, 1);
CREATE TABLE lda_training_odd_voc_size
(
docid INT4,
wordid INT4,
count INT4
);
INSERT INTO lda_training_odd_voc_size VALUES
(0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13, 1),
(1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1, 14, 1),(1, 17, 1),(1, 18, 1),
(2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13, 1),(2, 15, 1),(2, 18, 2),
(3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17, 2),(3, 19, 1),
(4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4, 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1),
(5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7, 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2),
(6, 1, 1),(6, 3, 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1),
(7, 0, 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7, 16, 1),
(8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1),
(9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9, 19, 1),(9, 20, 1);
CREATE TABLE lda_testing
(
docid INT4,
wordid INT4,
count INT4
);
INSERT INTO lda_testing VALUES
(0, 0, 2),(0, 8, 1),(0, 9, 1),(0, 10, 1),(0, 12, 1),(0, 15, 2),(0, 18, 1),(0, 19, 1),
(1, 0, 1),(1, 2, 1),(1, 5, 1),(1, 7, 1),(1, 12, 2),(1, 13, 1),(1, 16, 1),(1, 17, 1),(1, 18, 1),
(2, 0, 1),(2, 1, 1),(2, 2, 1),(2, 3, 1),(2, 4, 1),(2, 5, 1),(2, 6, 1),(2, 12, 1),(2, 14, 1),(2, 18, 1),
(3, 2, 2),(3, 6, 2),(3, 7, 1),(3, 9, 1),(3, 11, 2),(3, 14, 1),(3, 15, 1),
(4, 1, 1),(4, 2, 2),(4, 3, 1),(4, 5, 2),(4, 6, 1),(4, 11, 1),(4, 18, 2);
---------------------------------------------------------------------------
-- Test
---------------------------------------------------------------------------
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
2, -- iter_num
10, -- alpha
0.01); -- beta
SELECT lda_parse_model(model, voc_size, topic_num) AS parsed_model
FROM lda_model;
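-- Sanity check on the model summary; voc_size and topic_num are taken verbatim from the
-- lda_train call above.
SELECT assert(voc_size = 20 AND topic_num = 5,
'lda_model should record voc_size = 20 and topic_num = 5')
FROM lda_model;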
SELECT
assert(
topic_num IS NOT NULL AND topic_num = 5,
'topic_num should be 5!'
),
assert(
sum_of_topic_counts IS NOT NULL AND sum_of_word_counts IS NOT NULL
AND sum_of_topic_counts = sum_of_word_counts,
'sum_of_topic_counts (' || sum_of_topic_counts::text || ') != sum_of_word_counts (' || sum_of_word_counts::text || ')!'
)
FROM
(
SELECT
array_upper((parsed_model).total_topic_counts, 1) AS topic_num,
array_sum((parsed_model).total_topic_counts) AS sum_of_topic_counts
FROM
(
SELECT lda_parse_model(model, voc_size, topic_num) AS parsed_model
FROM lda_model
) subq_parsed_model
) subq_topic_num,
(
SELECT sum(count) AS sum_of_word_counts FROM lda_training
) subq_word_count;
SELECT lda_predict(
'lda_testing',
'lda_model',
'lda_pred');
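-- Illustrative check (assumption: the prediction output table keeps the docid column of
-- lda_testing, which holds 5 documents).
SELECT assert(count(DISTINCT docid) = 5, 'lda_pred should cover all 5 test documents')
FROM lda_pred;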
SELECT lda_predict(
'lda_testing',
'lda_model',
'lda_pred_2',
5);
SELECT lda_get_perplexity(
'lda_model',
'lda_pred');
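-- Illustrative check (assumption: lda_get_perplexity returns a single, strictly positive value).
SELECT assert(lda_get_perplexity('lda_model', 'lda_pred') > 0,
'Perplexity of lda_pred should be positive');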
SELECT __lda_util_index_sort(array[1, 4, 2, 3]);
SELECT __lda_util_transpose(array[[1, 2, 3],[4, 5, 6]]);
SELECT assert(count(*) = 2, 'Wrong answer: __lda_util_unnest_transpose()')
FROM
(
SELECT __lda_util_unnest_transpose(array[1, 2, 0, 4, 5, 0]::bigint[], 3, 2)
) subq;
SELECT assert(count(*) = 3, 'Wrong answer: __lda_util_unnest()')
FROM
(
SELECT __lda_util_unnest(array[1, 2, 0, 4, 5, 0]::bigint[], 3, 2)
) subq;
SELECT __lda_util_norm_with_smoothing(array[1, 4, 2, 3], 0.1);
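-- Illustrative check (assumptions: the helper returns a smoothed probability vector and
-- array_sum() accepts its return type); the normalized values should sum to approximately 1.
SELECT assert(
abs(array_sum(__lda_util_norm_with_smoothing(array[1, 4, 2, 3], 0.1)) - 1.0) < 1e-6,
'__lda_util_norm_with_smoothing should return a normalized vector');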
-- no words hit ceiling
SELECT
assert(
__lda_check_count_ceiling(
array[0, 0, 0, 0]::bigint[],
2,
2)
IS NULL,
'__lda_check_count_ceiling should return NULL for [0, 0, 0, 0]');
-- all words hit the ceiling (negative counts), so a non-NULL result is expected
SELECT
assert(
__lda_check_count_ceiling(
array[-1, -1, -1, -1]::bigint[],
2,
2)
IS NOT NULL,
'__lda_check_count_ceiling should not return NULL for [-1, -1, -1, -1]');
SELECT lda_get_topic_desc(
'lda_model',
'lda_vocab',
'topic_word_desc',
5);
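-- Illustrative sanity check (assumes the call above materializes the topic_word_desc table).
SELECT assert(count(*) > 0, 'topic_word_desc should not be empty') FROM topic_word_desc;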
SELECT lda_get_topic_word_count(
'lda_model',
'topic_word_count');
SELECT lda_get_word_topic_count(
'lda_model',
'topic_word_count_2');
SELECT lda_get_word_topic_mapping(
'lda_output_data',
'word_topic_mapping');
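-- Illustrative sanity check (assumes the call above materializes the word_topic_mapping table).
SELECT assert(count(*) > 0, 'word_topic_mapping should not be empty') FROM word_topic_mapping;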
SELECT *
FROM __lda_util_norm_vocab('lda_vocab', 'norm_lda_vocab');
SELECT *
FROM __lda_util_norm_dataset('lda_testing', 'norm_lda_vocab',
'norm_lda_data');
SELECT *
FROM __lda_util_conorm_data('lda_testing', 'lda_vocab',
'norm_lda_data_2', 'norm_lda_vocab_2');
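-- Illustrative check (assumption: co-normalization keeps one row per retained word, so the
-- co-normalized vocabulary can never be larger than the original one).
SELECT assert(
(SELECT count(*) FROM norm_lda_vocab_2) <= (SELECT count(*) FROM lda_vocab),
'co-normalized vocabulary should not exceed the original vocabulary');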
-- Cover odd/even combinations of voc_size and topic_num
SELECT lda_train(
'lda_training_odd_voc_size',
'lda_model_odd_voc_size_even_topic_num',
'lda_output_odd_voc_size_even_topic_num',
21, 6, 2, 3, 0.01);
SELECT lda_train(
'lda_training_odd_voc_size',
'lda_model_odd_voc_size_odd_topic_num',
'lda_output_odd_voc_size_odd_topic_num',
21, 5, 2, 3, 0.01);
SELECT lda_train(
'lda_training',
'lda_model_even_voc_size_even_topic_num',
'lda_output_even_voc_size_even_topic_num',
20, 6, 2, 3, 0.01);
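-- Illustrative check (assumption: lda_train writes a single summary row per model table).
SELECT assert(
(SELECT count(*) FROM lda_model_odd_voc_size_even_topic_num) = 1 AND
(SELECT count(*) FROM lda_model_odd_voc_size_odd_topic_num) = 1 AND
(SELECT count(*) FROM lda_model_even_voc_size_even_topic_num) = 1,
'each lda_train call above should produce a one-row model table');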
DROP TABLE IF EXISTS documents;
CREATE TABLE documents(docid INT4, contents TEXT);
INSERT INTO documents VALUES
(0, 'b a a c'),
(1, 'd e f f f');
ALTER TABLE documents ADD COLUMN words TEXT[];
UPDATE documents SET words = regexp_split_to_array(lower(contents), E'[\\s+\\.\\,]');
DROP TABLE IF EXISTS my_training, my_training_vocabulary;
SELECT term_frequency('documents', 'docid', 'words', 'my_training', TRUE);
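-- Illustrative check (assumptions: term_frequency(..., TRUE) materializes my_training_vocabulary
-- with one row per distinct word; the two documents contain the 6 distinct words a-f).
SELECT assert(count(*) = 6, 'my_training_vocabulary should contain 6 words')
FROM my_training_vocabulary;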
DROP TABLE IF EXISTS my_model, my_outdata;
SELECT lda_train(
'my_training',
'my_model',
'my_outdata',
6, 2, 2, 3, 0.01);
DROP TABLE IF EXISTS word_topic_count;
SELECT lda_get_word_topic_count( 'my_model', 'word_topic_count');
SELECT * FROM word_topic_count ORDER BY wordid;
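-- Illustrative check (assumption: word_topic_count carries one wordid per vocabulary entry,
-- and the vocabulary built above has 6 words).
SELECT assert(count(DISTINCT wordid) = 6, 'word_topic_count should cover all 6 words')
FROM word_topic_count;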
-- This function will validate that the output of lda_get_word_topic_count() is consistent with the output of lda_train
CREATE OR REPLACE FUNCTION validate_lda_output() RETURNS integer AS $$
DECLARE
-- variables for looping through the word_topic_mapping table
word_topic_mapping_row RECORD;
word_topic_mapping_wordid int;
word_topic_mapping_topicid int;
-- we use the following array from word_topic_mapping to compare with the output of lda_get_word_topic_count()
word_topic_mapping_array INT[6][2] := ARRAY[[0,0],[0,0],[0,0],[0,0],[0,0],[0,0]];
word_topic_mapping_topic_count int[];
-- variables for looping through the output of lda_get_word_topic_count()
word_topic_count_row RECORD;
word_topic_count_topic_count int[];
BEGIN
-- Create helper table from out_data
DROP TABLE IF EXISTS word_topic_mapping;
CREATE TABLE word_topic_mapping AS
SELECT wordid, topicid
FROM (
SELECT unnest(svec_from_string('{' || array_to_string(counts, ',') || '}:{' || array_to_string(words, ',') || '}')::float[]) AS wordid,
unnest(topic_assignment) AS topicid
FROM my_outdata
) a
ORDER BY wordid;
-- Construct the mapping array
FOR word_topic_mapping_row IN SELECT * FROM word_topic_mapping ORDER BY wordid LOOP
-- we need to add 1 because Postgres arrays start from 1 while our wordid and topicid start from 0
word_topic_mapping_wordid := word_topic_mapping_row.wordid+1;
word_topic_mapping_topicid := word_topic_mapping_row.topicid+1;
word_topic_mapping_array[word_topic_mapping_wordid][word_topic_mapping_topicid] := word_topic_mapping_array[word_topic_mapping_wordid][word_topic_mapping_topicid] + 1;
END LOOP;
-- Loop through the output of lda_get_word_topic_count() and compare the results with the helper table word_topic_mapping
FOR word_topic_count_row IN select * from word_topic_count ORDER BY wordid LOOP
word_topic_count_topic_count := ARRAY[word_topic_count_row.topic_count];
word_topic_mapping_topic_count := word_topic_mapping_array[word_topic_count_row.wordid+1:word_topic_count_row.wordid+1]; -- Here arrayX[i:i] means the ith 1d array of a 2d array
IF (word_topic_mapping_topic_count != word_topic_count_topic_count) THEN
RAISE EXCEPTION 'Topic assignment for wordid % does not match. word_topic_mapping: % word_topic_count: %',word_topic_count_row.wordid, word_topic_mapping_topic_count, word_topic_count_topic_count;
END IF;
END LOOP;
RETURN 1;
END;
$$ LANGUAGE plpgsql;
select validate_lda_output();
---------- TEST CASES FOR PERPLEXITY ----------
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
2, -- iter_num
10, -- alpha
0.01, -- beta
2, -- evaluate_every
.2); -- perplexity_tol
SELECT assert(perplexity_iters = '{2}', 'Number of perplexity iterations is wrong') FROM lda_model;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
3, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
.1 -- perplexity_tol
);
SELECT assert(array_upper(perplexity,1) = 3, 'Perplexity calculation is wrong') FROM lda_model;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Function to check that the perplexity value returned by lda_get_perplexity()
-- and the one calculated by lda_train() are the same.
CREATE OR REPLACE FUNCTION validate_perplexity() RETURNS boolean AS $$
DECLARE
perplexity_from_func Double precision[];
perplexity_lda_train Double precision[];
BEGIN
drop table if exists lda_model, lda_output_data;
PERFORM lda_train(
'lda_training',
'lda_model',
'lda_output_data',
20, 5, 2, 10, 0.01, 2, .2);
SELECT array_agg(round(lda_get_perplexity::numeric,10)) INTO perplexity_from_func from lda_get_perplexity('lda_model','lda_output_data');
select perplexity INTO perplexity_lda_train from lda_model ;
if perplexity_lda_train != perplexity_from_func THEN
return FALSE;
ELSE
return TRUE;
END IF;
END;
$$ LANGUAGE plpgsql;
SELECT assert(validate_perplexity() = TRUE, 'Perplexity calculation is wrong');
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Test for evaluate_every = Number of iterations = 1. It should give exactly one perplexity value --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
1, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
.1 -- perplexity_tol
);
select assert(perplexity != '{}', 'Perplexity should be calculated') from lda_model;
select assert(array_upper(perplexity,1) = 1, 'Perplexity should not have more than 1 value') from lda_model;
-- Test for evaluate_every = 0: perplexity should not be calculated --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
1, -- iter_num
10, -- alpha
0.01, -- beta
0, -- evaluate_every
.1 -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;
-- Test for evaluate_every = -1: perplexity should not be calculated --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
1, -- iter_num
10, -- alpha
0.01, -- beta
-1, -- evaluate_every
.1 -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;
-- Test to check if the perplexity_iters are matching the expected value --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
2, -- evaluate_every
.1 -- perplexity_tol
);
SELECT assert(array_upper(perplexity_iters,1) <= 5, 'Perplexity iterations are different from expected') FROM lda_model ;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Test: if the difference in perplexity between two consecutive evaluations is less than --
-- perplexity_tol, training stops early. Here perplexity_tol is very large (100), so the --
-- difference between the perplexity values computed at iterations 2 and 4 falls below the --
-- tolerance, and only 2 perplexity values are recorded. --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
2, -- evaluate_every
100 -- perplexity_tol
);
SELECT assert(abs(perplexity[2] - perplexity[1]) < 100, 'Perplexity difference should be less than perplexity_tol') FROM lda_model;
SELECT assert(array_upper(perplexity_iters,1) = 2, 'Perplexity iterations are different from expected') FROM lda_model ;
SELECT assert(perplexity[1] > 0, 'Perplexity value should be greater than 0') FROM lda_model;
-- Commenting the below flaky test to re-visit later.
-- select assert(array_upper(ARRAY(Select distinct unnest(perplexity)),1)= array_upper(perplexity,1) , 'Perplexity values should be unique') from lda_model ;
-- Test for evaluate_every = 1 and perplexity_tol = 0: the iterations should not stop early --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
0 -- perplexity_tol
);
select assert(num_iterations = 10, 'Training should run for all 10 iterations') from lda_model;
-- Test for evaluate_every = NULL and perplexity_tol = NULL. In this case it should not calculate perplexity --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
NULL, -- evaluate_every
NULL -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;
-- Test for evaluate_every = 1 and perplexity_tol = NULL. --
-- In this case it should calculate perplexity using the default perplexity_tol of 0.1 --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
1, -- evaluate_every
NULL -- perplexity_tol
);
select assert(array_upper(perplexity_iters,1) >= 1, 'Perplexity iterations are different from expected') from lda_model ;
select assert(perplexity != '{}', 'Perplexity should be calculated') from lda_model;
-- Test for evaluate_every = NULL and perplexity_tol != NULL --
-- In this case it should not calculate perplexity --
drop table if exists lda_model, lda_output_data;
SELECT lda_train(
'lda_training', -- data_table
'lda_model', -- model_table
'lda_output_data', -- output_data_table
20, -- voc_size
5, -- topic_num
10, -- iter_num
10, -- alpha
0.01, -- beta
NULL, -- evaluate_every
1 -- perplexity_tol
);
select assert(perplexity = '{}', 'Perplexity should not be calculated') from lda_model ;
select assert(perplexity_iters = '{}', 'Perplexity iterations should be empty') from lda_model;