/* ----------------------------------------------------------------------- *//**
*
* @file online_sv.sql_in
*
* @brief SQL functions for support vector machines
 * @sa For an introduction to support vector machines (SVMs) and related kernel
* methods, see the module description \ref grp_kernmach.
*
*//* ------------------------------------------------------------------------*/
/**
@addtogroup grp_kernmach
@about
Support vector machines (SVMs) and related kernel methods have been one of
the most popular and well-studied machine learning techniques of the
past 15 years, with an amazing number of innovations and applications.
In a nutshell, an SVM model \f$ f(\boldsymbol x) \f$ takes the form
\f[
f(\boldsymbol x) = \sum_i \alpha_i k(\boldsymbol x_i, \boldsymbol x),
\f]
where each \f$ \alpha_i \f$ is a real number, each \f$ \boldsymbol x_i \f$ is a
data point from the training set (called a support vector), and
\f$ k(\cdot, \cdot) \f$ is a kernel function that measures how "similar" two
objects are. In regression, \f$ f(\boldsymbol x) \f$ is the regression function
we seek. In classification, \f$ f(\boldsymbol x) \f$ serves as
the decision boundary; for example, in binary classification, the predictor
outputs class 1 for an object \f$ \boldsymbol x \f$ if \f$ f(\boldsymbol x) \geq 0 \f$,
and class 2 otherwise.
In the case when the kernel function \f$ k(\cdot, \cdot) \f$ is the standard
inner product on vectors, \f$ f(\boldsymbol x) \f$ is just an alternative way of
writing a linear function
\f[
f'(\boldsymbol x) = \langle \boldsymbol w, \boldsymbol x \rangle,
\f]
where \f$ \boldsymbol w \f$ is a weight vector having the same dimension as
\f$ \boldsymbol x \f$. A key strength of SVMs is that we can use more
sophisticated kernel functions to efficiently learn linear models in high-dimensional
feature spaces, since \f$ k(\boldsymbol x_i, \boldsymbol x_j) \f$ can be
understood as an efficient way of computing an inner product in the feature
space:
\f[
k(\boldsymbol x_i, \boldsymbol x_j)
= \langle \phi(\boldsymbol x_i), \phi(\boldsymbol x_j) \rangle,
\f]
where \f$ \phi(\boldsymbol x) \f$ projects \f$ \boldsymbol x \f$ into a
(possibly infinite-dimensional) feature space.
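For example, for \f$ \boldsymbol x \in \mathbb{R}^2 \f$, the degree-2
polynomial kernel \f$ k(\boldsymbol x_i, \boldsymbol x_j) =
\langle \boldsymbol x_i, \boldsymbol x_j \rangle^2 \f$ computes the inner
product under the feature map
\f[
\phi(\boldsymbol x) = (x_1^2, \sqrt{2}\, x_1 x_2, x_2^2),
\f]
so the kernel can be evaluated in time linear in the input dimension even
though the feature space is quadratically larger.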
There are many algorithms for learning kernel machines. This module
implements the class of online learning with kernels algorithms
described in Kivinen et al. [1]. See also the book by Scholkopf and
Smola [2] for a much more detailed treatment.
The implementation follows the original description in the Kivinen et al.
paper faithfully, except that we only update the support vector model
when we make a significant error. The original algorithms update the
support vector model at every step, even when no error was made, in the
name of regularisation. For practical purposes, updating only when
necessary is both faster and, at least in the i.i.d. setting, better from
a learning-theoretic point of view; this has been verified empirically to
a certain degree.
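Schematically (see [1] for the precise statement), each step of these
algorithms performs an update of the form
\f[
f \leftarrow (1 - \eta \lambda) f - \eta \, l'(f(\boldsymbol x_t), y_t) \, k(\boldsymbol x_t, \cdot),
\f]
where \f$ \eta \f$ is the learning rate, \f$ \lambda \f$ the regularisation
parameter, and \f$ l' \f$ the derivative of the loss with respect to the
prediction; as noted above, this implementation applies the update only when
the relevant error or margin condition is violated.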
Methods for classification, regression and novelty detection are
available. Multiple instances of the algorithms can be executed
in parallel on different subsets of the training data. The resultant
support vector models can then be combined using standard techniques
like averaging or majority voting.
Training data points are accessed via a table or a view. The support
vector models can also be stored in tables for fast execution.
@prereq
None at this point. The \link grp_svec sparse vector SVEC
datatype\endlink will eventually be required.
@usage
Here are the main learning functions.
- Regression learning is achieved through the following function
\code
madlib.sv_regression(input_table text, model_name text, parallel bool)
\endcode
- Classification learning is achieved through the following function
\code
madlib.sv_classification(input_table text, model_name text, parallel bool)
\endcode
- Novelty detection is achieved through the following function
\code
madlib.sv_novelty_detection(input_table text, model_name text, parallel bool)
\endcode
In each case, input_table is the name of the table/view with the training
data, model_name is the name under which we want to store the resultant
learned model, and parallel is a flag indicating whether the system
should learn multiple models in parallel. (The multiple models can be
combined to make predictions; more on that shortly.)
Here are the functions that can be used to make predictions on new
data points.
- To make predictions on new data points using a single model
learned previously, we use the function
\code
madlib.svs_predict(model_name text, x float8[])
\endcode
where model_name is the name of the model stored and x is a data point.
- To make predictions on new data points using multiple models
learned in parallel, we use the function
\code
madlib.svs_predict_combo(model_name text, x float8[])
\endcode
where model_name is the name under which the models are stored, and x
is a data point.
- To make predictions on new data points stored in a table using
previously learned models, we use the function
\code
madlib.sv_predict(input_table text, col_name text, model_name text, output_table text, parallel bool)
\endcode
where the data points are stored under column col_name in input_table,
output_table is the table into which we will store the results,
model_name is the name of the model previously learned, and parallel is
true if the model was learned in parallel, false otherwise.
The output_table is created during the function call; an existing table with
the same name will be dropped.
Models that have been stored can be deleted using the function
\code
madlib.drop_sv_model(modelname text)
\endcode
@examp
As a general first step, we need to prepare and populate an input
table/view with the following structure:
\code
CREATE TABLE my_schema.my_input_table
(
id INT, -- point ID
ind FLOAT8[], -- data point
label FLOAT8 -- label of data point
);
\endcode
Note: The label field is not required for novelty detection.
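For example, a single labelled point (with illustrative values) can be
inserted in the usual way:
\code
INSERT INTO my_schema.my_input_table VALUES
(1, '{1.0, 2.0, 3.0, 4.0, 10.0}', 50);
\endcode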
Example usage for regression:
-# We can randomly generate 1000 5-dimensional data points labelled by the simple target function
\code
t(x) = if x[5] = 10 then 50 else -50;
\endcode
and store that in the madlib.sv_train_data table as follows:
\code
testdb=# select madlib.generateRegData(1000, 5);
\endcode
-# We can now learn a regression model and store the resultant model
under the name 'myexp'.
\code
testdb=# select madlib.sv_regression('madlib.sv_train_data', 'myexp', false);
\endcode
-# We can now use the model to predict the labels of new data points
as follows:
\code
testdb=# select madlib.svs_predict('myexp', '{1,2,4,20,10}');
testdb=# select madlib.svs_predict('myexp', '{1,2,4,20,-10}');
\endcode
-# To learn multiple support vector models, we replace the learning step above by
\code
testdb=# select madlib.sv_regression('madlib.sv_train_data', 'myexp', true);
\endcode
The resultant models can be used for prediction as follows:
\code
testdb=# select * from madlib.svs_predict_combo('myexp', '{1,2,4,20,10}');
\endcode
-# We can also predict the labels of all the data points stored in a table.
For example, we can execute the following:
\code
testdb=# create table madlib.sv_reg_test ( id int, ind float8[] );
testdb=# insert into madlib.sv_reg_test (select id, ind from madlib.sv_train_data limit 20);
testdb=# select madlib.sv_predict('madlib.sv_reg_test', 'ind', 'myexp', 'madlib.sv_reg_output1', false);
testdb=# select * from madlib.sv_reg_output1;
testdb=# select madlib.sv_predict('madlib.sv_reg_test', 'ind', 'myexp', 'madlib.sv_reg_output2', true);
testdb=# select * from madlib.sv_reg_output2;
\endcode
Example usage for classification:
-# We can randomly generate 2000 5-dimensional data points labelled by the simple
target function
\code
t(x) = if x[1] > 0 and x[2] < 0 then 1 else -1;
\endcode
and store that in the madlib.sv_train_data table as follows:
\code
testdb=# select madlib.generateClData(2000, 5);
\endcode
-# We can now learn a classification model and store the resultant model
under the name 'myexpc'.
\code
testdb=# select madlib.sv_classification('madlib.sv_train_data', 'myexpc', false);
\endcode
-# We can now use the model to predict the labels of new data points
as follows:
\code
testdb=# select madlib.svs_predict('myexpc', '{10,-2,4,20,10}');
\endcode
-# To learn multiple support vector models, replace the model-building and prediction steps above by
\code
testdb=# select madlib.sv_classification('madlib.sv_train_data', 'myexpc', true);
testdb=# select * from madlib.svs_predict_combo('myexpc', '{10,-2,4,20,10}');
\endcode
Example usage for novelty detection:
-# We can randomly generate 100 2-dimensional data points (the normal cases)
and store that in the madlib.sv_train_data table as follows:
\code
testdb=# select madlib.generateNdData(100, 2);
\endcode
-# Learning and predicting using a single novelty detection model can be done as follows:
\code
testdb=# select madlib.sv_novelty_detection('madlib.sv_train_data', 'myexpnd', false);
testdb=# select madlib.svs_predict('myexpnd', '{10,-10}');
testdb=# select madlib.svs_predict('myexpnd', '{-1,-1}');
\endcode
-# Learning and predicting using multiple models can be done as follows:
\code
testdb=# select madlib.sv_novelty_detection('madlib.sv_train_data', 'myexpnd', true);
testdb=# select * from madlib.svs_predict_combo('myexpnd', '{10,-10}');
testdb=# select * from madlib.svs_predict_combo('myexpnd', '{-1,-1}');
\endcode
@sa file online_sv.sql_in (documenting the SQL functions)
@internal
@sa namespace online_sv (documenting the implementation in Python)
@endinternal
@literature
[1] Jyrki Kivinen, Alexander J. Smola, and Robert C. Williamson: <em>Online
Learning with Kernels</em>, IEEE Transactions on Signal Processing, 52(8),
2165-2176, 2004.
[2] Bernhard Scholkopf and Alexander J. Smola: <em>Learning with Kernels:
Support Vector Machines, Regularization, Optimization, and Beyond</em>,
MIT Press, 2002.
*/
-- The following is the structure to record the results of a learning process.
-- We work with arrays of float8 for now; we'll extend the code to work with sparse vectors next.
-- DROP TYPE IF EXISTS MADLIB_SCHEMA.model_rec CASCADE;
CREATE TYPE MADLIB_SCHEMA.model_rec AS (
inds int, -- number of individuals processed
cum_err float8, -- cumulative error
epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
rho float8, -- classification margin
b float8, -- classifier offset
nsvs int, -- number of support vectors
ind_dim int, -- the dimension of the individuals
weights float8[], -- the weight of the support vectors
individuals float8[][] -- the array of support vectors
);
-- Create the necessary tables for storing training data and the learned support vector models
-- DROP TABLE IF EXISTS MADLIB_SCHEMA.sv_train_data CASCADE;
CREATE TABLE MADLIB_SCHEMA.sv_train_data ( id int, ind float8[], label float8 )
ifdef(`GREENPLUM', `DISTRIBUTED BY (id)')
;
-- DROP TABLE IF EXISTS MADLIB_SCHEMA.sv_results CASCADE;
CREATE TABLE MADLIB_SCHEMA.sv_results ( id text, model MADLIB_SCHEMA.model_rec )
ifdef(`GREENPLUM', `DISTRIBUTED BY (id)')
;
-- DROP TABLE IF EXISTS MADLIB_SCHEMA.sv_model CASCADE;
CREATE TABLE MADLIB_SCHEMA.sv_model ( id text, weight float8, sv float8[] ) ifdef(`GREENPLUM', `DISTRIBUTED BY (weight)')
;
-- Kernel functions are a generalisation of inner products.
-- They provide the means by which we can extend linear machines to work in non-linear transformed feature spaces.
-- Here we specify the dot product as the kernel; it can be replaced with any other kernel, including the polynomial
-- and Gaussian kernels defined below.
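-- The first overload below evaluates the kernel between the idx-th stored support vector
-- (a row of a 2-D array) and y; PL/pgSQL cannot pass a single row of a 2-D array directly,
-- hence the element-by-element copy.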
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kernel(x float8[][], idx int, y float8[]) RETURNS float8 AS $$
DECLARE
len INT;
ind float8[];
BEGIN
len := array_upper(y,1);
FOR i IN 1..len LOOP
ind[i] := x[idx][i];
END LOOP;
RETURN MADLIB_SCHEMA.kernel(ind, y);
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kernel(x float8[], y float8[]) RETURNS float8 AS $$
BEGIN
RETURN MADLIB_SCHEMA.dot_kernel(x, y); -- this doesn't require svecs
-- RETURN dot(x, y); -- this does require svecs
-- RETURN MADLIB_SCHEMA.polynomial_kernel(x, y, 2);
END
$$ LANGUAGE plpgsql;
-- This is just inner product. For efficiency, this can be implemented as a C UDF. In fact, if the sparse vector library
-- is installed, one can just define the body of the function to be RETURN dot(x,y);.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dot_kernel(x float8[], y float8[]) RETURNS float8 AS $$
DECLARE
len int;
ret float8 := 0;
BEGIN
len := array_upper(y,1);
FOR i in 1..len LOOP
ret := ret + x[i]*y[i];
END LOOP;
RETURN ret;
END;
$$ LANGUAGE plpgsql;
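-- For reference, here is a sketch of the dot()-based variant mentioned above, kept
-- commented out so that this file does not depend on the sparse vector module being
-- installed:
--
-- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dot_kernel(x float8[], y float8[]) RETURNS float8 AS $$
-- BEGIN
--     RETURN dot(x, y); -- dot() is provided by the sparse vector module
-- END;
-- $$ LANGUAGE plpgsql;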
-- Here are two other standard kernels.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.polynomial_kernel(x float8[], y float8[], degree int) RETURNS float8 AS $$
BEGIN
RETURN MADLIB_SCHEMA.dot_kernel(x,y) ^ degree;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gaussian_kernel(x float8[], y float8[], gamma float) RETURNS float8 AS $$
BEGIN
-- Note: x-y relies on an element-wise subtraction operator for float8[], which is
-- not built into PostgreSQL; it is provided by, e.g., the sparse vector module.
RETURN exp(-1.0 * gamma * MADLIB_SCHEMA.dot_kernel(x-y,x-y));
END;
$$ LANGUAGE plpgsql;
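-- This function evaluates a support vector model in record form on a data point ind:
-- it computes f(ind) = sum_i weights[i] * kernel(individuals[i], ind).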
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svs_predict(svs MADLIB_SCHEMA.model_rec, ind float8[])
RETURNS float8 AS $$
DECLARE
ret FLOAT8 := 0;
BEGIN
FOR i IN 1..svs.nsvs LOOP
ret := ret + svs.weights[i] * MADLIB_SCHEMA.kernel(svs.individuals, i, ind);
END LOOP;
RETURN ret;
END;
$$ LANGUAGE plpgsql;
-- This is the main online support vector regression learning algorithm.
-- The function updates the support vector model as it processes each new training example.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
-- The learning parameters (eta, slambda, and nu) are hardcoded at the moment.
-- We may want to make them input parameters at some stage, although the naive user would probably be daunted by the prospect
-- of having to specify them.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.online_sv_reg_update(svs MADLIB_SCHEMA.model_rec, ind float8[], label float8)
RETURNS MADLIB_SCHEMA.model_rec AS $$
DECLARE
eta FLOAT8 := 0.05; -- learning rate
slambda FLOAT8 := 0.2; -- regularisation parameter
nu FLOAT8 := 0.001; -- compression parameter, a number between 0 and 1; the fraction of the training data that appear as support vectors
p FLOAT8; -- prediction for the input individual
diff FLOAT8; -- difference between p and label
error FLOAT8; -- absolute value of diff
weight FLOAT8; -- the weight of ind if it turns out to be a support vector
BEGIN
IF svs IS NULL THEN
svs := (0, 0, 0, 0.5, 1, 1, array_upper(ind,1), '{0}', array[ind]); -- we have to be careful to initialise a multi-dimensional array
END IF;
p := MADLIB_SCHEMA.svs_predict(svs, ind);
diff := label - p;
error := abs(diff);
svs.inds := svs.inds + 1;
svs.cum_err := svs.cum_err + error;
IF (error > svs.epsilon) THEN
FOR i IN 1..svs.nsvs LOOP -- Unlike the original algorithm, this rescaling is only done when we make a large enough error.
svs.weights[i] := svs.weights[i] * (1 - eta * slambda);
END LOOP;
weight := eta;
IF (diff < 0) THEN weight := -1 * weight; END IF;
svs.nsvs := svs.nsvs + 1;
svs.weights[svs.nsvs] := weight;
svs.individuals := array_cat(svs.individuals, ind);
svs.epsilon := svs.epsilon + (1 - nu) * eta;
ELSE
svs.epsilon := svs.epsilon - eta * nu;
END IF;
return svs;
END
$$ LANGUAGE plpgsql;
-- DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.online_sv_reg_agg(float8[], float8);
CREATE AGGREGATE MADLIB_SCHEMA.online_sv_reg_agg(float8[], float8) (
sfunc = MADLIB_SCHEMA.online_sv_reg_update,
stype = MADLIB_SCHEMA.model_rec
);
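-- For illustration only, a regression model could be computed directly with the
-- aggregate along the following lines; the sv_regression wrapper below drives the
-- aggregate from the Python module online_sv and stores the result in
-- MADLIB_SCHEMA.sv_results. The classification and novelty detection aggregates
-- below are invoked analogously.
--
-- SELECT MADLIB_SCHEMA.online_sv_reg_agg(ind, label) FROM MADLIB_SCHEMA.sv_train_data;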
-- This is the main online support vector classification algorithm.
-- The function updates the support vector model as it processes each new training example.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
-- The learning parameters (eta and nu) are hardcoded at the moment.
-- We may want to make them input parameters at some stage, although the naive user would probably be daunted by the prospect
-- of having to specify them.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.online_sv_cl_update(svs MADLIB_SCHEMA.model_rec, ind float8[], label float8)
RETURNS MADLIB_SCHEMA.model_rec AS $$
DECLARE
eta FLOAT8 := 0.05; -- learning rate
nu FLOAT8 := 0.2; -- the fraction of the training data with margin error, a number between 0 and 1; small nu => large margin and more support vectors
p FLOAT8; -- prediction for the input individual
BEGIN
IF svs IS NULL THEN
svs := (0, 0, 0, 0.5, 1, 1, array_upper(ind,1), '{0}', array[ind]); -- we have to be careful to initialise a multi-dimensional array
END IF;
p := label * (MADLIB_SCHEMA.svs_predict(svs, ind) + svs.b);
svs.inds := svs.inds + 1;
IF p < 0 THEN
svs.cum_err := svs.cum_err + 1;
END IF;
IF (p < svs.rho) THEN
FOR i IN 1..svs.nsvs LOOP -- Unlike the original algorithm, this rescaling is only done when we make a margin error.
svs.weights[i] := svs.weights[i] * (1 - eta);
END LOOP;
svs.nsvs := svs.nsvs + 1;
svs.weights[svs.nsvs] := label * eta;
svs.individuals := array_cat(svs.individuals, ind);
svs.b := svs.b + eta * label;
svs.rho := svs.rho + eta * (1 - nu);
ELSE
svs.rho := svs.rho - eta * nu;
END IF;
return svs;
END
$$ LANGUAGE plpgsql;
-- DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.online_sv_cl_agg(float8[], float8);
CREATE AGGREGATE MADLIB_SCHEMA.online_sv_cl_agg(float8[], float8) (
sfunc = MADLIB_SCHEMA.online_sv_cl_update,
stype = MADLIB_SCHEMA.model_rec
);
-- This is the main online support vector novelty detection algorithm.
-- The function updates the support vector model as it processes each new training example.
-- In contrast to classification and regression, the training data points have no labels.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
-- The learning parameters (eta and nu) are hardcoded at the moment.
-- We may want to make them input parameters at some stage, although the naive user would probably be daunted by the prospect
-- of having to specify them.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.online_sv_nd_update(svs MADLIB_SCHEMA.model_rec, ind float8[])
RETURNS MADLIB_SCHEMA.model_rec AS $$
DECLARE
eta FLOAT8 := 0.1; -- learning rate
nu FLOAT8 := 0.05; -- the fraction of the training data with margin error, a number between 0 and 1
p FLOAT8; -- prediction for the input individual
BEGIN
IF svs IS NULL THEN
svs := (0, 0, 0, 0.5, 1, 1, array_upper(ind,1), '{0}', array[ind]); -- we have to be careful to initialise a multi-dimensional array
END IF;
p := MADLIB_SCHEMA.svs_predict(svs, ind);
svs.inds := svs.inds + 1;
IF (p < svs.rho) THEN
FOR i IN 1..svs.nsvs LOOP -- Unlike the original algorithm, this rescaling is only done when we make a margin error.
svs.weights[i] := svs.weights[i] * (1 - eta);
END LOOP;
svs.nsvs := svs.nsvs + 1;
svs.weights[svs.nsvs] := eta;
svs.individuals := array_cat(svs.individuals, ind);
svs.rho := svs.rho + eta * (1 - nu);
ELSE
svs.rho := svs.rho - eta * nu;
END IF;
return svs;
END
$$ LANGUAGE plpgsql;
-- DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.online_sv_nd_agg(float8[]);
CREATE AGGREGATE MADLIB_SCHEMA.online_sv_nd_agg(float8[]) (
sfunc = MADLIB_SCHEMA.online_sv_nd_update,
stype = MADLIB_SCHEMA.model_rec
);
-- This function transforms a MADLIB_SCHEMA.model_rec into a set of (weight, support_vector) values for the purpose of storage in a table.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.transform_rec(modelname text, ind_dim int, weights float8[], individuals float8[][]) RETURNS SETOF MADLIB_SCHEMA.sv_model AS $$
DECLARE
nsvs INT;
sv MADLIB_SCHEMA.sv_model;
BEGIN
nsvs = array_upper(weights,1);
FOR i IN 1..nsvs LOOP
sv.id = modelname;
sv.weight = weights[i];
FOR j IN 1..ind_dim LOOP sv.sv[j] = individuals[i][j]; END LOOP; -- we copy the individual because we can't say sv.sv[j] = individuals[i]
RETURN NEXT sv;
END LOOP;
END;
$$ LANGUAGE plpgsql;
-- This function copies the MADLIB_SCHEMA.model_rec stored under modelname in the MADLIB_SCHEMA.sv_results table into the MADLIB_SCHEMA.sv_model table.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.storeModel(modelname TEXT) RETURNS VOID AS $$
DECLARE
myind_dim INT;
myweights float8[];
myindividuals float8[][];
-- mysvs MADLIB_SCHEMA.model_rec;
BEGIN
-- SELECT INTO mysvs model FROM MADLIB_SCHEMA.sv_results WHERE id = modelname; -- for some strange reason this line doesn't work....
SELECT INTO myind_dim (model).ind_dim FROM MADLIB_SCHEMA.sv_results WHERE id = modelname;
SELECT INTO myweights (model).weights FROM MADLIB_SCHEMA.sv_results WHERE id = modelname;
SELECT INTO myindividuals (model).individuals FROM MADLIB_SCHEMA.sv_results WHERE id = modelname;
INSERT INTO MADLIB_SCHEMA.sv_model (SELECT * FROM MADLIB_SCHEMA.transform_rec(modelname, myind_dim, myweights, myindividuals));
END;
$$ LANGUAGE plpgsql;
-- This function stores a collection of models learned in parallel into the MADLIB_SCHEMA.sv_model table.
-- The different models are assumed to be named modelname1, modelname2, ....
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.storeModel(modelname TEXT, n INT) RETURNS VOID AS $$
DECLARE
BEGIN
FOR i IN 0..n-1 LOOP
PERFORM MADLIB_SCHEMA.storeModel(modelname || i);
END LOOP;
END;
$$ LANGUAGE plpgsql;
/**
* @brief Evaluates a support-vector model on a given data point
*
* @param modelname The learned model \f$ f \f$ to be used
* @param ind The data point \f$ \boldsymbol x \f$
* @return This function returns \f$ f(\boldsymbol x) \f$
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svs_predict(modelname text, ind float8[], OUT ret float8) RETURNS FLOAT8 AS $$
BEGIN
SELECT INTO ret sum(weight * MADLIB_SCHEMA.kernel(sv, ind)) FROM MADLIB_SCHEMA.sv_model WHERE id = modelname;
END;
$$ LANGUAGE plpgsql;
-- DROP TYPE IF EXISTS MADLIB_SCHEMA.model_pr CASCADE;
CREATE TYPE MADLIB_SCHEMA.model_pr AS ( model text, prediction float8 );
/**
* @brief Evaluates multiple support-vector models on a data point
*
* This function performs prediction using the support vector machines stored in
* the <tt>MADLIB_SCHEMA.sv_model</tt> table.
* The different models are assumed to be named <tt><em>modelname</em>1</tt>,
* <tt><em>modelname</em>2</tt>, ....
* An average prediction is given at the end.
*
* @param modelname Prefix of the model names.
* @param ind The data point \f$ \boldsymbol x \f$
* @return This function returns a table: For each model, there is a row
* containing the result returned by \ref svs_predict(). Moreover, the last
* row contains the average value, over all models.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svs_predict_combo(modelname text, ind float8[]) RETURNS SETOF MADLIB_SCHEMA.model_pr AS $$
DECLARE
sumpr float8 := 0;
mpr MADLIB_SCHEMA.model_pr;
n int;
BEGIN
SELECT INTO n COUNT(DISTINCT(id)) FROM MADLIB_SCHEMA.sv_model WHERE position(modelname in id) > 0 AND modelname <> id;
IF n = 0 THEN
RAISE EXCEPTION 'No parallel model with name prefix % found', modelname;
END IF;
FOR i IN 0..n-1 LOOP
mpr.model := modelname || i;
mpr.prediction := MADLIB_SCHEMA.svs_predict(mpr.model, ind);
sumpr := sumpr + mpr.prediction;
RETURN NEXT mpr;
END LOOP;
mpr.model := 'avg';
mpr.prediction := sumpr / n;
RETURN NEXT mpr;
END;
$$ LANGUAGE plpgsql;
-- This function removes all models whose id contains modelname from the sv_results table
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.drop_sv_model(modelname text)
RETURNS VOID
AS $$
sql = 'DELETE FROM MADLIB_SCHEMA.sv_results WHERE position(\'' + modelname + '\' in id) > 0';
plpy.execute(sql);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector regression function
*
* @param input_table The name of the table/view with the training data
* @param modelname The name under which we want to store the resultant learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @return Textual summary of the algorithm run
*
* @internal
* @sa This function is a wrapper for online_sv::sv_regression().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.sv_regression(input_table text, modelname text, parallel bool)
RETURNS TEXT
AS $$
import sys
try:
from madlib import online_sv
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import online_sv
return online_sv.sv_regression(input_table, modelname, parallel);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector classification function
*
* @param input_table The name of the table/view with the training data
* @param modelname The name under which we want to store the resultant learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @return Textual summary of the algorithm run
*
* @internal
* @sa This function is a wrapper for online_sv::sv_classification().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.sv_classification(input_table text, modelname text, parallel bool)
RETURNS TEXT
AS $$
import sys
try:
from madlib import online_sv
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import online_sv
return online_sv.sv_classification(input_table, modelname, parallel);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector novelty detection function.
*
* @param input_table The name of the table/view with the training data
* @param modelname The name under which we want to store the resultant learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @return Textual summary of the algorithm run
*
* @internal
* @sa This function is a wrapper for online_sv::sv_novelty_detection().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.sv_novelty_detection(input_table text, modelname text, parallel bool)
RETURNS TEXT
AS $$
import sys
try:
from madlib import online_sv
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import online_sv
return online_sv.sv_novelty_detection(input_table, modelname, parallel);
$$ LANGUAGE 'plpythonu';
/**
* @brief Scores the data points stored in a table using a learned support-vector model
*
* @param input_table Name of table/view containing the data points to be scored
* @param col_name Name of column in input_table containing the data points
* @param modelname Name of the learned model to be used for scoring
* @param output_table Name of table to store the results
* @param parallel A flag indicating whether the model to be used was learned in parallel
* @return Textual summary of the algorithm run
*
* @internal
* @sa This function is a wrapper for online_sv::sv_predict().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.sv_predict(input_table text, col_name text, modelname text, output_table text, parallel bool)
RETURNS TEXT
AS $$
import sys
try:
from madlib import online_sv
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import online_sv
return online_sv.sv_predict(input_table, col_name, modelname, output_table, parallel);
$$ LANGUAGE 'plpythonu';
-- Generate artificial training data
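-- randomInd(d): the first d-1 coordinates are uniform in [-20, 20); the last coordinate
-- is +10 or -10 with equal probability, and it determines the regression label.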
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.randomInd(d INT) RETURNS float8[] AS $$
DECLARE
ret float8[];
BEGIN
FOR i IN 1..(d-1) LOOP
ret[i] = RANDOM() * 40 - 20;
END LOOP;
IF (RANDOM() > 0.5) THEN
ret[d] = 10;
ELSE
ret[d] = -10;
END IF;
RETURN ret;
END
$$ LANGUAGE plpgsql;
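-- randomInd2(d): all coordinates are uniform in [10, 15); used as the "normal" data
-- for novelty detection.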
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.randomInd2(d INT) RETURNS float8[] AS $$
DECLARE
ret float8[];
BEGIN
FOR i IN 1..d LOOP
ret[i] = RANDOM() * 5 + 10;
END LOOP;
RETURN ret;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.generateRegData(num int, dim int) RETURNS VOID AS $$
plpy.execute("DELETE FROM MADLIB_SCHEMA.sv_train_data")
plpy.execute("INSERT INTO MADLIB_SCHEMA.sv_train_data SELECT a.val, MADLIB_SCHEMA.randomInd(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
plpy.execute("UPDATE MADLIB_SCHEMA.sv_train_data SET label = MADLIB_SCHEMA.targetRegFunc(ind)")
$$ LANGUAGE 'plpythonu';
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.targetRegFunc(ind float8[]) RETURNS float8 AS $$
DECLARE
dim int;
BEGIN
dim = array_upper(ind,1);
IF (ind[dim] = 10) THEN RETURN 50; END IF;
RETURN -50;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.generateClData(num int, dim int) RETURNS VOID AS $$
plpy.execute("DELETE FROM MADLIB_SCHEMA.sv_train_data")
plpy.execute("INSERT INTO MADLIB_SCHEMA.sv_train_data SELECT a.val, MADLIB_SCHEMA.randomInd(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
plpy.execute("UPDATE MADLIB_SCHEMA.sv_train_data SET label = MADLIB_SCHEMA.targetClFunc(ind)")
$$ LANGUAGE 'plpythonu';
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.targetClFunc(ind float8[]) RETURNS float8 AS $$
BEGIN
IF (ind[1] > 0 AND ind[2] < 0) THEN RETURN 1; END IF;
RETURN -1;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.generateNdData(num int, dim int) RETURNS VOID AS $$
plpy.execute("DELETE FROM MADLIB_SCHEMA.sv_train_data")
plpy.execute("INSERT INTO MADLIB_SCHEMA.sv_train_data SELECT a.val, MADLIB_SCHEMA.randomInd2(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
$$ LANGUAGE 'plpythonu';