blob: b71de16970c052ce581b20cf8a4a4e1c7e3ff732 [file] [log] [blame]
CREATE EXTENSION gp_sparse_vector;
SET search_path TO sparse_vector;
DROP TABLE if exists test;
CREATE TABLE test (a int, b svec) DISTRIBUTED BY (a);
INSERT INTO test (SELECT 1,gp_extract_feature_histogram('{"one","two","three","four","five","six"}','{"twe","four","five","six","one","three","two","one"}'));
INSERT INTO test (SELECT 2,gp_extract_feature_histogram('{"one","two","three","four","five","six"}','{"the","brown","cat","ran","across","three","dogs"}'));
INSERT INTO test (SELECT 3,gp_extract_feature_histogram('{"one","two","three","four","five","six"}','{"two","four","five","six","one","three","two","one"}'));
-- Test the equals operator (should be only 3 rows)
SELECT a,b::float8[] cross_product_equals FROM (SELECT a,b FROM test) foo WHERE b = foo.b ORDER BY a;
DROP TABLE IF EXISTS test2;
CREATE TABLE test2 AS SELECT * FROM test DISTRIBUTED BY (a);
-- Test the plus operator (should be 9 rows)
SELECT (t1.b+t2.b)::float8[] cross_product_sum FROM test t1, test2 t2;
-- Test ORDER BY
SELECT (t1.b+t2.b)::float8[] cross_product_sum, l2norm(t1.b+t2.b) l2norm, (t1.b+t2.b) sparse_vector FROM test t1, test2 t2 ORDER BY 3;
SELECT (sum(t1.b))::float8[] AS features_sum FROM test t1;
-- Test the div operator
SELECT (t1.b/(SELECT sum(b) FROM test))::float8[] AS weights FROM test t1 ORDER BY a;
-- Test the * operator
SELECT t1.b %*% (t1.b/(SELECT sum(b) FROM test)) AS raw_score FROM test t1 ORDER BY a;
-- Test the * and l2norm operators
SELECT (t1.b %*% (t1.b/(SELECT sum(b) FROM test))) / (l2norm(t1.b) * l2norm((SELECT sum(b) FROM test))) AS norm_score FROM test t1 ORDER BY a;
-- Test the ^ and l1norm operators
SELECT ('{1,2}:{20.,10.}'::svec)^('{1}:{3.}'::svec);
SELECT (t1.b %*% (t1.b/(SELECT sum(b) FROM test))) / (l1norm(t1.b) * l1norm((SELECT sum(b) FROM test))) AS norm_score FROM test t1 ORDER BY a;
-- Test the multi-concatenation and show sizes compared with a normal array
DROP TABLE IF EXISTS corpus_proj;
DROP TABLE IF EXISTS corpus_proj_array;
CREATE TABLE corpus_proj AS (SELECT 10000 *|| ('{45,2,35,4,15,1}:{0,1,0,1,0,2}'::svec) result ) distributed randomly;
CREATE TABLE corpus_proj_array AS (SELECT result::float8[] FROM corpus_proj) distributed randomly;
-- Calculate on-disk size of sparse vector
SELECT pg_size_pretty(pg_total_relation_size('corpus_proj'));
-- Calculate on-disk size of normal array
SELECT pg_size_pretty(pg_total_relation_size('corpus_proj_array'));
-- Calculate L1 norm from sparse vector
SELECT l1norm(result) FROM corpus_proj;
-- Calculate L1 norm from float8[]
SELECT l1norm(result) FROM corpus_proj_array;
-- Calculate L2 norm from sparse vector
SELECT l2norm(result) FROM corpus_proj;
-- Calculate L2 norm from float8[]
SELECT l2norm(result) FROM corpus_proj_array;
DROP TABLE corpus_proj;
DROP TABLE corpus_proj_array;
DROP TABLE test;
DROP TABLE test2;
-- Test operators between svec and float8[]
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec) %*% ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[];
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[] %*% ('{1,2,3,4}:{3,4,5,6}'::svec);
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec) / ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[];
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[] / ('{1,2,3,4}:{3,4,5,6}'::svec);
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec) * ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[];
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[] * ('{1,2,3,4}:{3,4,5,6}'::svec);
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec) + ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[];
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[] + ('{1,2,3,4}:{3,4,5,6}'::svec);
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec) - ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[];
SELECT ('{1,2,3,4}:{3,4,5,6}'::svec)::float8[] - ('{1,2,3,4}:{3,4,5,6}'::svec);
-- Test the pivot operator in the presence of NULL values
DROP TABLE IF EXISTS pivot_test;
CREATE TABLE pivot_test(a float8) distributed randomly;
INSERT INTO pivot_test VALUES (0),(1),(NULL),(2),(3);
SELECT l1norm(array_agg(a)) FROM pivot_test;
DROP TABLE IF EXISTS pivot_test;
-- Answer should be 5
SELECT vec_median(array_agg(a)) FROM (SELECT generate_series(1,9) a) foo;
-- Answer should be a 10-wide vector
SELECT array_agg(a) FROM (SELECT trunc(7) a,generate_series(1,100000) ORDER BY a) foo;
-- Average is 4.50034, median is 5
SELECT vec_median('{9960,9926,10053,9993,10080,10050,9938,9941,10030,10029}:{1,9,8,7,6,5,4,3,2,0}'::svec);
SELECT vec_median('{9960,9926,10053,9993,10080,10050,9938,9941,10030,10029}:{1,9,8,7,6,5,4,3,2,0}'::svec::float8[]);
-- Test operator == as a joining condition when this column could be null. (See issue #12986)
SELECT DISTINCT a.table_name, a.character_maximum_length
FROM information_schema.columns a INNER JOIN information_schema.columns b ON a.table_name=b.table_name AND a.column_name=b.column_name
WHERE a.character_maximum_length == b.character_maximum_length ORDER BY a.table_name;
DROP EXTENSION gp_sparse_vector;
RESET search_path;