| --------------------------------------------------------------------------- |
| -- Rules: |
| -- ------ |
| -- 1) Any DB objects should be created w/o schema prefix, |
| -- since this file is executed in a separate schema context. |
| -- 2) There should be no DROP statements in this script, since |
| -- all objects created in the default schema will be cleaned-up outside. |
| --------------------------------------------------------------------------- |
| m4_include(`SQLCommon.m4') |
| |
| --------------------------------------------------------------------------- |
| -- Build vocabulary: |
| --------------------------------------------------------------------------- |
| CREATE TABLE lda_vocab(wordid INT4, word TEXT); |
| |
| INSERT INTO lda_vocab VALUES |
| (0, 'code'), (1, 'data'), (2, 'graph'), (3, 'image'), (4, 'input'), (5, |
| 'layer'), (6, 'learner'), (7, 'loss'), (8, 'model'), (9, 'network'), (10, |
| 'neuron'), (11, 'object'), (12, 'output'), (13, 'rate'), (14, 'set'), (15, |
| 'signal'), (16, 'sparse'), (17, 'spatial'), (18, 'system'), (19, 'training'); |
| |
| --------------------------------------------------------------------------- |
| -- Build training dataset: |
| --------------------------------------------------------------------------- |
| CREATE TABLE lda_training |
| ( |
| docid INT4, |
| wordid INT4, |
| count INT4 |
| ); |
| |
| INSERT INTO lda_training VALUES |
| (0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13, |
| 1), (1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1, |
| 14, 1),(1, 17, 1),(1, 18, 1), (2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13, |
| 1),(2, 15, 1),(2, 18, 2), (3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17, |
| 2),(3, 19, 1), (4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4, |
| 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1), (5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7, |
| 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2), (6, 1, 1),(6, 3, |
| 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1), (7, 0, |
| 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7, |
| 16, 1), (8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1), |
| (9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9, |
| 19, 1); |
| |
| CREATE TABLE lda_training_odd_voc_size |
| ( |
| docid INT4, |
| wordid INT4, |
| count INT4 |
| ); |
| |
| INSERT INTO lda_training_odd_voc_size VALUES |
| (0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13, |
| 1), (1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1, |
| 14, 1),(1, 17, 1),(1, 18, 1), (2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13, |
| 1),(2, 15, 1),(2, 18, 2), (3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17, |
| 2),(3, 19, 1), (4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4, |
| 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1), (5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7, |
| 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2), (6, 1, 1),(6, 3, |
| 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1), (7, 0, |
| 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7, |
| 16, 1), (8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1), |
| (9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9, |
| 19, 1),(9, 20, 1); |
| |
| |
| CREATE TABLE lda_testing |
| ( |
| docid INT4, |
| wordid INT4, |
| count INT4 |
| ) |
| m4_ifdef(`__GREENPLUM__',`DISTRIBUTED BY (docid)'); |
| |
| INSERT INTO lda_testing VALUES |
| (0, 0, 2),(0, 8, 1),(0, 9, 1),(0, 10, 1),(0, 12, 1),(0, 15, 2),(0, 18, 1),(0, |
| 19, 1), (1, 0, 1),(1, 2, 1),(1, 5, 1),(1, 7, 1),(1, 12, 2),(1, 13, 1),(1, 16, |
| 1),(1, 17, 1),(1, 18, 1), (2, 0, 1),(2, 1, 1),(2, 2, 1),(2, 3, 1),(2, 4, 1),(2, |
| 5, 1),(2, 6, 1),(2, 12, 1),(2, 14, 1),(2, 18, 1), (3, 2, 2),(3, 6, 2),(3, 7, |
| 1),(3, 9, 1),(3, 11, 2),(3, 14, 1),(3, 15, 1), (4, 1, 1),(4, 2, 2),(4, 3, |
| 1),(4, 5, 2),(4, 6, 1),(4, 11, 1),(4, 18, 2); |
| |
| --------------------------------------------------------------------------- |
| -- Test |
| --------------------------------------------------------------------------- |
| SELECT lda_train( |
| 'lda_training', |
| 'lda_model', |
| 'lda_output_data', |
| 20, 5, 2, 10, 0.01); |
| |
| SELECT lda_predict( |
| 'lda_testing', |
| 'lda_model', |
| 'lda_pred'); |
| |
| SELECT lda_predict( |
| 'lda_testing', |
| 'lda_model', |
| 'lda_pred_2', |
| 5); |
| |
| SELECT lda_get_perplexity( |
| 'lda_model', |
| 'lda_pred'); |
| |
| SELECT __lda_util_index_sort(array[1, 4, 2, 3]); |
| SELECT __lda_util_transpose(array[[1, 2, 3],[4, 5, 6]]); |
| SELECT assert(count(*) = 2, 'Wrong answer: __lda_util_unnest_transpose()') |
| FROM |
| ( |
| SELECT __lda_util_unnest_transpose(array[1, 2, 0, 4, 5, 0]::bigint[], 3, 2) |
| ) subq; |
| SELECT assert(count(*) = 3, 'Wrong answer: __lda_util_unnest()') |
| FROM |
| ( |
| SELECT __lda_util_unnest(array[1, 2, 0, 4, 5, 0]::bigint[], 3, 2) |
| ) subq; |
| SELECT __lda_util_norm_with_smoothing(array[1, 4, 2, 3], 0.1); |
| |
| -- no words hit ceiling |
| SELECT |
| assert( |
| __lda_check_count_ceiling( |
| array[0, 0, 0, 0]::bigint[], |
| 2, |
| 2) |
| IS NULL, |
| '__lda_check_count_ceiling should return NULL for [0, 0, 0, 0]'); |
| -- length: 1 |
| SELECT |
| assert( |
| __lda_check_count_ceiling( |
| array[-1, -1, -1, -1]::bigint[], |
| 2, |
| 2) |
| IS NOT NULL, |
| '__lda_check_count_ceiling should not return NULL for [-1, -1, -1, -1]'); |
| |
| SELECT lda_get_topic_desc( |
| 'lda_model', |
| 'lda_vocab', |
| 'topic_word_desc', |
| 5); |
| |
| SELECT lda_get_topic_word_count( |
| 'lda_model', |
| 'topic_word_count'); |
| |
| SELECT lda_get_word_topic_count( |
| 'lda_model', |
| 'topic_word_count_2'); |
| |
| SELECT * |
| FROM __lda_util_norm_vocab('lda_vocab', 'norm_lda_vocab'); |
| |
| SELECT * |
| FROM __lda_util_norm_dataset('lda_testing', 'norm_lda_vocab', |
| 'norm_lda_data'); |
| |
| SELECT * |
| FROM __lda_util_conorm_data('lda_testing', 'lda_vocab', |
| 'norm_lda_data_2', 'norm_lda_vocab_2'); |
| |
| -- both voc_size and topic_num are odd or even |
| SELECT lda_train( |
| 'lda_training_odd_voc_size', |
| 'lda_model_odd_voc_size_even_topic_num', |
| 'lda_output_odd_voc_size_even_topic_num', |
| 21, 6, 2, 3, 0.01); |
| |
| SELECT lda_train( |
| 'lda_training_odd_voc_size', |
| 'lda_model_odd_voc_size_odd_topic_num', |
| 'lda_output_odd_voc_size_odd_topic_num', |
| 21, 5, 2, 3, 0.01); |
| |
| SELECT lda_train( |
| 'lda_training', |
| 'lda_model_even_voc_size_even_topic_num', |
| 'lda_output_even_voc_size_even_topic_num', |
| 20, 6, 2, 3, 0.01); |