src/ports/postgres/modules/utilities/test/text_utilities.sql_in - madlib - Git at Google

 ---------------------------------------------------------------------------
 -- Build training dataset:
 ---------------------------------------------------------------------------
 DROP TABLE IF EXISTS training;
 CREATE TABLE training
 (
     docid INT4,
     wordid INT4,
     count INT4
 );

 INSERT INTO training VALUES
 (0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13,
 1), (1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1,
 14, 1),(1, 17, 1),(1, 18, 1), (2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13,
 1),(2, 15, 1),(2, 18, 2), (3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17,
 2),(3, 19, 1), (4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4,
 11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1), (5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7,
 1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2), (6, 1, 1),(6, 3,
 1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1), (7, 0,
 1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7,
 16, 1), (8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1),
 (9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9,
 19, 1);


 DROP TABLE IF EXISTS documents;
 CREATE TABLE documents AS
 SELECT docid, array_agg(wordid) as words
 FROM (
       SELECT docid, wordid, generate_series(1, count)
       FROM training
 ) q
 GROUP BY docid;

 DROP TABLE IF EXISTS output_terms;
 DROP TABLE IF EXISTS output_terms_vocabulary;
 SELECT term_frequency('documents', 'docid', 'words', 'output_terms', FALSE);

 SELECT assert(c = 81, 'Incorrect results for term frequency')
 FROM (
     SELECT count(*) as c
     FROM (
         SELECT (q1.count - q2.count) as diff
         FROM training q1, output_terms q2
         WHERE q1.docid = q2.docid and q1.wordid = q2.word::INTEGER
         ) s
     WHERE diff = 0
 ) subq;

 DROP TABLE IF EXISTS output_terms;
 DROP TABLE IF EXISTS output_terms_vocabulary;
 SELECT term_frequency('documents', 'docid', 'words', 'output_terms', TRUE);

 SELECT * FROM output_terms;
 SELECT * FROM output_terms_vocabulary;
	---------------------------------------------------------------------------
	-- Build training dataset:
	---------------------------------------------------------------------------
	DROP TABLE IF EXISTS training;
	CREATE TABLE training
	(
	docid INT4,
	wordid INT4,
	count INT4
	);

	INSERT INTO training VALUES
	(0, 0, 2),(0, 3, 2),(0, 5, 1),(0, 7, 1),(0, 8, 1),(0, 9, 1),(0, 11, 1),(0, 13,
	1), (1, 0, 1),(1, 3, 1),(1, 4, 1),(1, 5, 1),(1, 6, 1),(1, 7, 1),(1, 10, 1),(1,
	14, 1),(1, 17, 1),(1, 18, 1), (2, 4, 2),(2, 5, 1),(2, 6, 2),(2, 12, 1),(2, 13,
	1),(2, 15, 1),(2, 18, 2), (3, 0, 1),(3, 1, 2),(3, 12, 3),(3, 16, 1),(3, 17,
	2),(3, 19, 1), (4, 1, 1),(4, 2, 1),(4, 3, 1),(4, 5, 1),(4, 6, 1),(4, 10, 1),(4,
	11, 1),(4, 14, 1),(4, 18, 1),(4, 19, 1), (5, 0, 1),(5, 2, 1),(5, 5, 1),(5, 7,
	1),(5, 10, 1),(5, 12, 1),(5, 16, 1),(5, 18, 1),(5, 19, 2), (6, 1, 1),(6, 3,
	1),(6, 12, 2),(6, 13, 1),(6, 14, 2),(6, 15, 1),(6, 16, 1),(6, 17, 1), (7, 0,
	1),(7, 2, 1),(7, 4, 1),(7, 5, 1),(7, 7, 2),(7, 8, 1),(7, 11, 1),(7, 14, 1),(7,
	16, 1), (8, 2, 1),(8, 4, 4),(8, 6, 2),(8, 11, 1),(8, 15, 1),(8, 18, 1),
	(9, 0, 1),(9, 1, 1),(9, 4, 1),(9, 9, 2),(9, 12, 2),(9, 15, 1),(9, 18, 1),(9,
	19, 1);


	DROP TABLE IF EXISTS documents;
	CREATE TABLE documents AS
	SELECT docid, array_agg(wordid) as words
	FROM (
	SELECT docid, wordid, generate_series(1, count)
	FROM training
	) q
	GROUP BY docid;

	DROP TABLE IF EXISTS output_terms;
	DROP TABLE IF EXISTS output_terms_vocabulary;
	SELECT term_frequency('documents', 'docid', 'words', 'output_terms', FALSE);

	SELECT assert(c = 81, 'Incorrect results for term frequency')
	FROM (
	SELECT count(*) as c
	FROM (
	SELECT (q1.count - q2.count) as diff
	FROM training q1, output_terms q2
	WHERE q1.docid = q2.docid and q1.wordid = q2.word::INTEGER
	) s
	WHERE diff = 0
	) subq;

	DROP TABLE IF EXISTS output_terms;
	DROP TABLE IF EXISTS output_terms_vocabulary;
	SELECT term_frequency('documents', 'docid', 'words', 'output_terms', TRUE);

	SELECT * FROM output_terms;
	SELECT * FROM output_terms_vocabulary;