-- src/ports/postgres/modules/utilities/test/text_utilities.ic.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  *//* ----------------------------------------------------------------------- */

 ---------------------------------------------------------------------------
 -- Build training dataset:
 --   training  : one row per (docid, wordid) pair plus its occurrence count
 --   documents : one row per docid; words is the array of word ids, with
 --               each word id repeated once per counted occurrence
 ---------------------------------------------------------------------------
 DROP TABLE IF EXISTS training;
 CREATE TABLE training
 (
     docid INT4,
     wordid INT4,
     count INT4
 );

 -- Target columns are listed explicitly so the fixture does not depend on
 -- the physical column order of the table.
 INSERT INTO training (docid, wordid, count) VALUES
 (0, 0, 2),(0, 3, 2),(0, 5, 1),(9, 18, 1),(9, 19, 1);


 DROP TABLE IF EXISTS documents;
 -- generate_series(1, count) emits one row per occurrence, so array_agg
 -- collects every wordid as many times as its count says.
 CREATE TABLE documents AS
 SELECT docid, array_agg(wordid) AS words
 FROM (
       SELECT docid, wordid, generate_series(1, count) AS occurrence
       FROM training
 ) expanded
 GROUP BY docid;

 -- Remove results left over from earlier runs. output_terms is the output
 -- table name handed to term_frequency below; output_terms_vocabulary is
 -- presumably its companion vocabulary table (TODO confirm against the
 -- term_frequency documentation).
 DROP TABLE IF EXISTS output_terms;
 DROP TABLE IF EXISTS output_terms_vocabulary;
 SELECT term_frequency('documents', 'docid', 'words', 'output_terms', FALSE);
	/* ----------------------------------------------------------------------- *//**
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*//* ----------------------------------------------------------------------- */

	---------------------------------------------------------------------------
	-- Build training dataset:
	--   training  : one row per (docid, wordid) pair plus its occurrence count
	--   documents : one row per docid; words is the array of word ids, with
	--               each word id repeated once per counted occurrence
	-- NOTE(review): this section repeats the fixture built earlier in this
	-- file. Each CREATE is preceded by DROP TABLE IF EXISTS, so running it a
	-- second time rebuilds the same tables; confirm whether the repetition is
	-- intentional.
	---------------------------------------------------------------------------
	DROP TABLE IF EXISTS training;
	CREATE TABLE training
	(
	    docid INT4,
	    wordid INT4,
	    count INT4
	);

	-- Target columns are listed explicitly so the fixture does not depend on
	-- the physical column order of the table.
	INSERT INTO training (docid, wordid, count) VALUES
	(0, 0, 2),(0, 3, 2),(0, 5, 1),(9, 18, 1),(9, 19, 1);


	DROP TABLE IF EXISTS documents;
	-- generate_series(1, count) emits one row per occurrence, so array_agg
	-- collects every wordid as many times as its count says.
	CREATE TABLE documents AS
	SELECT docid, array_agg(wordid) AS words
	FROM (
	    SELECT docid, wordid, generate_series(1, count) AS occurrence
	    FROM training
	) expanded
	GROUP BY docid;

	-- Remove results left over from earlier runs. output_terms is the output
	-- table name handed to term_frequency below; output_terms_vocabulary is
	-- presumably its companion vocabulary table (TODO confirm against the
	-- term_frequency documentation).
	DROP TABLE IF EXISTS output_terms;
	DROP TABLE IF EXISTS output_terms_vocabulary;
	SELECT term_frequency('documents', 'docid', 'words', 'output_terms', FALSE);