blob: 31f8845bf98c567c2dc8d0acd3475ddef92ae485 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file svec_util.sql_in
*
* @brief SQL utility functions for sparse vector data type
* <tt>svec</tt>
*
* @sa For an introduction to the sparse vector implementation, see the module
* description \ref grp_svec.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
--! Creates sparse vector representation given an array of indexes, respective values and
--! size of the required vector. The function just does the bucket filling with the values
--! at the respective indexes for the vector given the length.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_svec(BIGINT[], FLOAT8[], BIGINT)
RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'generate_sparse_vector'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
--! Creates the output table containing the sparse vector representation for the documents
--! given the dictionary table, documents tables and names of the respective columns.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gen_doc_svecs(
output_tbl TEXT,
dictionary_tbl TEXT,
dict_id_col TEXT,
dict_term_col TEXT,
documents_tbl TEXT,
doc_id_col TEXT,
doc_term_col TEXT,
doc_term_info_col TEXT
) RETURNS TEXT AS $$
PythonFunction(svec_util, generate_svec, generate_doc_svecs)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
--! Helper function for MADLIB_SCHEMA.gen_doc_svec UDF.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gen_doc_svecs()
RETURNS TEXT AS $$
PythonFunction(svec_util, generate_svec, generate_doc_svecs_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
--! Basic floating point scalar operator: MIN.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_dmin(float8,float8) RETURNS float8 AS 'MODULE_PATHNAME', 'float8_min' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Basic floating point scalar operator: MAX.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_dmax(float8,float8) RETURNS float8 AS 'MODULE_PATHNAME', 'float8_max' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Counts the number of non-zero entries in the input vector; the second argument is capped at 1, then added to the first; used as the sfunc in the svec_count_nonzero() aggregate below.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_count(MADLIB_SCHEMA.svec,MADLIB_SCHEMA.svec) RETURNS MADLIB_SCHEMA.svec
AS 'MODULE_PATHNAME', 'svec_count' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the logarithm of each element of the input SVEC.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_log(MADLIB_SCHEMA.svec) RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_log' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Returns true if two SVECs are equal, not counting zeros (zero equals anything). If the two SVECs are of different size, then the function essentially zero-pads the shorter one and performs the comparison.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_eq_non_zero(MADLIB_SCHEMA.svec,MADLIB_SCHEMA.svec) RETURNS boolean AS 'MODULE_PATHNAME', 'svec_eq_non_zero' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Returns true if left svec contains right one, meaning that every non-zero value in the right svec equals left one
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_contains(MADLIB_SCHEMA.svec,MADLIB_SCHEMA.svec) RETURNS boolean AS 'MODULE_PATHNAME', 'svec_contains' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the l2norm of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_l2norm(MADLIB_SCHEMA.svec) RETURNS float8 AS 'MODULE_PATHNAME', 'svec_l2norm' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the l2norm of a float8 array.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_l2norm(float8[]) RETURNS float8 AS 'MODULE_PATHNAME', 'float8arr_l2norm' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the l2norm distance between two SVECs.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.l2norm(MADLIB_SCHEMA.svec,MADLIB_SCHEMA.svec)
RETURNS float8 AS 'MODULE_PATHNAME', 'svec_svec_l2norm' LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the l1norm distance between two SVECs.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.l1norm(MADLIB_SCHEMA.svec,MADLIB_SCHEMA.svec)
RETURNS float8 AS 'MODULE_PATHNAME', 'svec_svec_l1norm' LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the l1norm of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_l1norm(MADLIB_SCHEMA.svec) RETURNS float8 AS 'MODULE_PATHNAME', 'svec_l1norm' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the l1norm of a float8 array.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_l1norm(float8[]) RETURNS float8 AS 'MODULE_PATHNAME', 'float8arr_l1norm' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the angle between two SVECs in radians.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.angle(MADLIB_SCHEMA.svec,MADLIB_SCHEMA.svec)
RETURNS float8 AS 'MODULE_PATHNAME', 'svec_svec_angle' LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the Tanimoto distance between two SVECs.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tanimoto_distance(MADLIB_SCHEMA.svec,MADLIB_SCHEMA.svec)
RETURNS float8 AS 'MODULE_PATHNAME', 'svec_svec_tanimoto_distance' LANGUAGE C STRICT IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Unnests an SVEC into a table of uncompressed values
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_unnest(MADLIB_SCHEMA.svec) RETURNS setof float8 AS 'MODULE_PATHNAME', 'svec_unnest' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Appends an element to the back of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_pivot(MADLIB_SCHEMA.svec,float8) RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_pivot' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Sums the elements of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_elsum(MADLIB_SCHEMA.svec) RETURNS float8 AS 'MODULE_PATHNAME', 'svec_summate' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Sums the elements of a float8 array.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_elsum(float8[]) RETURNS float8 AS 'MODULE_PATHNAME', 'float8arr_summate' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the median element of a float8 array.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_median(float8[]) RETURNS float8 AS 'MODULE_PATHNAME', 'float8arr_median' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the median element of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_median(MADLIB_SCHEMA.svec) RETURNS float8 AS 'MODULE_PATHNAME', 'svec_median' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Compares an SVEC to a float8, and returns positions of all elements not equal to the float as an array. Element index here starts at 0.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_nonbase_positions(MADLIB_SCHEMA.svec, FLOAT8) RETURNS INT8[] AS 'MODULE_PATHNAME', 'svec_nonbase_positions' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Compares an SVEC to a float8, and returns values of all elements not equal to the float as an array.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_nonbase_values(MADLIB_SCHEMA.svec, FLOAT8) RETURNS FLOAT8[] AS 'MODULE_PATHNAME', 'svec_nonbase_values' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Returns the dimension of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_dimension(MADLIB_SCHEMA.svec) RETURNS integer AS 'MODULE_PATHNAME', 'svec_dimension' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Applies a given function to each element of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_lapply(text,MADLIB_SCHEMA.svec) RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_lapply' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Appends a run-length block to the back of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_append(MADLIB_SCHEMA.svec,float8,int8) RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_append' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Projects onto an element of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_proj(MADLIB_SCHEMA.svec,int4) RETURNS float8 AS 'MODULE_PATHNAME', 'svec_proj' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Extracts a subvector of an SVEC given the subvector's start and end indices.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_subvec(MADLIB_SCHEMA.svec,int4,int4) RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_subvec' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Reverses the elements of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_reverse(MADLIB_SCHEMA.svec) RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_reverse' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Replaces the subvector of a given SVEC at a given start index with another SVEC. Note that element index should start at 1.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_change(MADLIB_SCHEMA.svec,int4,MADLIB_SCHEMA.svec) RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_change' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the hash of an SVEC.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_hash(MADLIB_SCHEMA.svec) RETURNS int4 AS 'MODULE_PATHNAME', 'svec_hash' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Computes the word-occurence vector of a document
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_sfv(text[], text[]) RETURNS MADLIB_SCHEMA.svec AS
'MODULE_PATHNAME', 'gp_extract_feature_histogram' LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Sorts an array of texts. This function should be in MADlib common.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_sort(text[]) RETURNS text[] AS $$
SELECT array(SELECT unnest($1::text[]) ORDER BY 1);
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
--! Converts an svec to a text string
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_to_string(MADLIB_SCHEMA.svec)
RETURNS text AS 'MODULE_PATHNAME', 'svec_to_string' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Converts a text string to an svec
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_from_string(text)
RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_from_string' STRICT LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Transition function for mean(svec) aggregate
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_mean_transition( FLOAT[], MADLIB_SCHEMA.svec)
RETURNS FLOAT[] AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Preliminary merge function for mean(svec) aggregate
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_mean_prefunc( FLOAT[], FLOAT[])
RETURNS FLOAT[] AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Final function for mean(svec) aggregate
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svec_mean_final( FLOAT[])
RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
--! Aggregate that computes the element-wise mean of a list of vectors.
--!
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mean( MADLIB_SCHEMA.svec) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA.mean( MADLIB_SCHEMA.svec) (
SFUNC = MADLIB_SCHEMA.svec_mean_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc = MADLIB_SCHEMA.svec_mean_prefunc,')
FINALFUNC = MADLIB_SCHEMA.svec_mean_final,
STYPE = FLOAT[]
);
--! Aggregate that provides the element-wise sum of a list of vectors.
--!
-- DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svec_sum(MADLIB_SCHEMA.svec);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svec_sum (MADLIB_SCHEMA.svec) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA.svec_sum (MADLIB_SCHEMA.svec) (
SFUNC = MADLIB_SCHEMA.svec_plus,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.svec_plus,')
INITCOND = '{1}:{0.}', -- Zero
STYPE = MADLIB_SCHEMA.svec
);
--! Aggregate that provides a tally of nonzero entries in a list of vectors.
--!
-- DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svec_count_nonzero(MADLIB_SCHEMA.svec);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svec_count_nonzero (MADLIB_SCHEMA.svec) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA.svec_count_nonzero (MADLIB_SCHEMA.svec) (
SFUNC = MADLIB_SCHEMA.svec_count,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.svec_plus,')
INITCOND = '{1}:{0.}', -- Zero
STYPE = MADLIB_SCHEMA.svec
);
--! Aggregate that turns a list of float8 values into an SVEC.
--!
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svec_agg (float8) CASCADE;
CREATE m4_ifdef(`__POSTGRESQL__', `', `ORDERED')
AGGREGATE MADLIB_SCHEMA.svec_agg (float8) (
SFUNC = MADLIB_SCHEMA.svec_pivot,
STYPE = MADLIB_SCHEMA.svec
);
--! Aggregate that computes the median element of a list of float8 values.
--!
-- DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svec_median_inmemory(float8);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svec_median_inmemory (float8) CASCADE;
CREATE AGGREGATE MADLIB_SCHEMA.svec_median_inmemory (float8) (
SFUNC = MADLIB_SCHEMA.svec_pivot,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.svec_concat,')
FINALFUNC = MADLIB_SCHEMA.svec_median,
STYPE = MADLIB_SCHEMA.svec
);
--! Normalizes an SVEC that is divides all elements by its norm/magnitude.
--!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.normalize(MADLIB_SCHEMA.svec)
RETURNS MADLIB_SCHEMA.svec AS 'MODULE_PATHNAME', 'svec_normalize' LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');