| --------------------------------------------------------------------------- |
| -- Build training dataset: |
| --------------------------------------------------------------------------- |
| DROP TABLE IF EXISTS training; |
| CREATE TABLE training |
| ( |
| docid INT4, |
| wordid INT4, |
| count INT4 |
| ); |
| |
| INSERT INTO training VALUES |
| (0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13, |
| 1), (1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1, |
| 14, 1),(1, 17, 1),(1, 18, 1), (2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13, |
| 1),(2, 15, 1),(2, 18, 2), (3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17, |
| 2),(3, 19, 1), (4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4, |
| 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1), (5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7, |
| 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2), (6, 1, 1),(6, 3, |
| 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1), (7, 0, |
| 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7, |
| 16, 1), (8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1), |
| (9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9, |
| 19, 1); |
| |
| |
| DROP TABLE IF EXISTS documents; |
| CREATE TABLE documents AS |
| SELECT docid, array_agg(wordid) as words |
| FROM ( |
| SELECT docid, wordid, generate_series(1, count) |
| FROM training |
| ) q |
| GROUP BY docid; |
| |
| DROP TABLE IF EXISTS output_terms; |
| DROP TABLE IF EXISTS output_terms_vocabulary; |
| SELECT term_frequency('documents', 'docid', 'words', 'output_terms', FALSE); |
| |
| SELECT assert(c = 81, 'Incorrect results for term frequency') |
| FROM ( |
| SELECT count(*) as c |
| FROM ( |
| SELECT (q1.count - q2.count) as diff |
| FROM training q1, output_terms q2 |
| WHERE q1.docid = q2.docid and q1.wordid = q2.word::INTEGER |
| ) s |
| WHERE diff = 0 |
| ) subq; |
| |
| DROP TABLE IF EXISTS output_terms; |
| DROP TABLE IF EXISTS output_terms_vocabulary; |
| SELECT term_frequency('documents', 'docid', 'words', 'output_terms', TRUE); |
| |
| SELECT * FROM output_terms; |
| SELECT * FROM output_terms_vocabulary; |
| |
| |
| |
| |