blob: a4883a6d426a206e8aaa07e073abc6556de77226 [file] [log] [blame]
---------------------------------------------------------------------------
-- Build training dataset:
---------------------------------------------------------------------------
DROP TABLE IF EXISTS training;
CREATE TABLE training
(
docid INT4,
wordid INT4,
count INT4
);
INSERT INTO training VALUES
(0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13,
1), (1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1,
14, 1),(1, 17, 1),(1, 18, 1), (2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13,
1),(2, 15, 1),(2, 18, 2), (3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17,
2),(3, 19, 1), (4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4,
11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1), (5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7,
1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2), (6, 1, 1),(6, 3,
1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1), (7, 0,
1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7,
16, 1), (8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1),
(9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9,
19, 1);
DROP TABLE IF EXISTS documents;
CREATE TABLE documents AS
SELECT docid, array_agg(wordid) as words
FROM (
SELECT docid, wordid, generate_series(1, count)
FROM training
) q
GROUP BY docid;
DROP TABLE IF EXISTS output_terms;
DROP TABLE IF EXISTS output_terms_vocabulary;
SELECT term_frequency('documents', 'docid', 'words', 'output_terms', FALSE);
SELECT assert(c = 81, 'Incorrect results for term frequency')
FROM (
SELECT count(*) as c
FROM (
SELECT (q1.count - q2.count) as diff
FROM training q1, output_terms q2
WHERE q1.docid = q2.docid and q1.wordid = q2.word::INTEGER
) s
WHERE diff = 0
) subq;
DROP TABLE IF EXISTS output_terms;
DROP TABLE IF EXISTS output_terms_vocabulary;
SELECT term_frequency('documents', 'docid', 'words', 'output_terms', TRUE);
SELECT * FROM output_terms;
SELECT * FROM output_terms_vocabulary;