-- blob: d8a1e412485d538c3fe2ba3aee75bc0cdc30c608 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*//* ----------------------------------------------------------------------- */
---------------------------------------------------------------------------
-- Build training dataset:
---------------------------------------------------------------------------
-- Sparse bag-of-words fixture: one row per (document, word) pair.
-- "count" is how many times the word id is repeated when the per-document
-- word arrays are built from this table further down in the script.
DROP TABLE IF EXISTS training;
CREATE TABLE training
(
    docid  INT4,
    wordid INT4,
    count  INT4
);
-- Name the target columns explicitly so the fixture does not silently
-- depend on the column order of the CREATE TABLE above.
INSERT INTO training (docid, wordid, count) VALUES
    (0,  0, 2),
    (0,  3, 2),
    (0,  5, 1),
    (9, 18, 1),
    (9, 19, 1);
-- Expand the sparse (docid, wordid, count) rows of "training" into one array
-- of word ids per document: every wordid is emitted "count" times and the
-- copies are then collected by docid.
DROP TABLE IF EXISTS documents;
CREATE TABLE documents AS
    SELECT
        expanded.docid,
        -- No ORDER BY inside the aggregate, so the element order within
        -- "words" is whatever order the rows happen to arrive in.
        array_agg(expanded.wordid) AS words
    FROM (
        -- A set-returning function in the select list yields one output row
        -- per generated value, i.e. "count" copies of each (docid, wordid).
        SELECT
            docid,
            wordid,
            generate_series(1, count) AS occurrence
        FROM training
    ) AS expanded
    GROUP BY expanded.docid;
-- Clear any leftovers from a previous run before invoking term_frequency,
-- so the call starts without pre-existing output tables.
DROP TABLE IF EXISTS output_terms_vocabulary;
DROP TABLE IF EXISTS output_terms;
-- NOTE(review): term_frequency is defined outside this script (presumably the
-- MADlib text-utilities function); the argument roles noted below are
-- assumed from that API -- confirm against its documentation.
SELECT term_frequency(
    'documents',     -- input table built above
    'docid',         -- document id column of the input table
    'words',         -- array-of-words column of the input table
    'output_terms',  -- presumably the name of the result table to create
    FALSE            -- presumably "do not build a vocabulary table" -- confirm
);