blob: 216c6d3cf63b5c17ffeb41c7f03c411ef168ec8b [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file online_sv.sql_in
*
* @brief SQL functions for support vector machines
* @sa For an introduction to Support vector machines (SVMs) and related kernel
* methods, see the module description \ref grp_kernmach.
*
*//* ------------------------------------------------------------------------*/
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_kernmach
@about
Support vector machines (SVMs) and related kernel methods have been one of
the most popular and well-studied machine learning techniques of the
past 15 years, with an amazing number of innovations and applications.
In a nutshell, an SVM model $f(x)$ takes the form of
\f[
f(x) = \sum_i \alpha_i k(x_i,x),
\f]
where each \f$ \alpha_i \f$ is a real number, each \f$ \boldsymbol x_i \f$ is a
data point from the training set (called a support vector), and
\f$ k(\cdot, \cdot) \f$ is a kernel function that measures how "similar" two
objects are. In regression, \f$ f(\boldsymbol x) \f$ is the regression function
we seek. In classification, \f$ f(\boldsymbol x) \f$ serves as
the decision boundary; so for example in binary classification, the predictor
can output class 1 for object $x$ if \f$ f(\boldsymbol x) \geq 0 \f$, and class
2 otherwise.
In the case when the kernel function \f$ k(\cdot, \cdot) \f$ is the standard
inner product on vectors, \f$ f(\boldsymbol x) \f$ is just an alternative way of
writing a linear function
\f[
f'(\boldsymbol x) = \langle \boldsymbol w, \boldsymbol x \rangle,
\f]
where \f$ \boldsymbol w \f$ is a weight vector having the same dimension as
\f$ \boldsymbol x \f$. One of the key points of SVMs is that we can use more
fancy kernel functions to efficiently learn linear models in high-dimensional
feature spaces, since \f$ k(\boldsymbol x_i, \boldsymbol x_j) \f$ can be
understood as an efficient way of computing an inner product in the feature
space:
\f[
k(\boldsymbol x_i, \boldsymbol x_j)
= \langle \phi(\boldsymbol x_i), \phi(\boldsymbol x_j) \rangle,
\f]
where \f$ \phi(\boldsymbol x) \f$ projects \f$ \boldsymbol x \f$ into a
(possibly infinite-dimensional) feature space.
There are many algorithms for learning kernel machines. This module
implements the class of online learning with kernels algorithms
described in Kivinen et al. [1]. See also the book Scholkopf and Smola [2]
for much more details.
The implementation follows the original description in the Kivinen et al
paper faithfully, except that we only update the support vector model
when we make a significant error. The original algorithms update the
support vector model at every step, even when no error was made, in the
name of regularisation. For practical purposes, and this is verified
empirically to a certain degree, updating only when necessary is both
faster and better from a learning-theoretic point of view, at least in
the i.i.d. setting.
Methods for classification, regression and novelty detection are
available. Multiple instances of the algorithms can be executed
in parallel on different subsets of the training data. The resultant
support vector models can then be combined using standard techniques
like averaging or majority voting.
Training data points are accessed via a table or a view. The support
vector models can also be stored in tables for fast execution.
@input
For classification and regression, the training table/view is expected to be of the following form:\n
<pre>{TABLE|VIEW} <em>input_table</em> (
...
<em>id</em> INT,
<em>ind</em> FLOAT8[],
<em>label</em> FLOAT8,
...
)</pre>
For novelty detection, the label field is not required.
@usage
- Regression learning is achieved through the following function:
<pre>SELECT \ref svm_regression(
'<em>input_table</em>', '<em>model_table</em>', <em>parallel</em>, '<em>kernel_func</em>',
<em>verbose DEFAULT false</em>, <em>eta DEFAULT 0.1</em>, <em>nu DEFAULT 0.005</em>, <em>slambda DEFAULT 0.05</em>
);</pre>
- Classification learning is achieved through the following function:
<pre>SELECT \ref svm_classification(
'<em>input_table</em>', '<em>model_table</em>', <em>parallel</em>, '<em>kernel_func</em>',
<em>verbose DEFAULT false</em>, <em>eta DEFAULT 0.1</em>, <em>nu DEFAULT 0.005</em>
);</pre>
- Novelty detection is achieved through the following function:
<pre>SELECT \ref svm_novelty_detection(
'<em>input_table</em>', '<em>model_table</em>', <em>parallel</em>, '<em>kernel_func</em>',
<em>verbose DEFAULT false</em>, <em>eta DEFAULT 0.1</em>, <em>nu DEFAULT 0.005</em>
);</pre>
Assuming the model_table parameter takes on value 'model', each learning function will produce two tables
as output: 'model' and 'model_param'.
The first contains the support vectors of the model(s) learned.
The second contains the parameters of the model(s) learned, which includes information like the kernel function
used and the value of the intercept, if there is one.
- To make predictions on a single data point x using a single model
learned previously, we use the function
<pre>SELECT \ref svm_predict('<em>model_table</em>',<em>x</em>);</pre>
- To make predictions on new data points using multiple models
learned in parallel, we use the function
<pre>SELECT \ref svm_predict_combo('<em>model_table</em>',<em>x</em>);</pre>
- Note that, at the moment, we cannot use MADLIB_SCHEMA.svm_predict() and MADLIB_SCHEMA.svm_predict_combo()
on multiple data points. For example, something like the following will fail:
<pre>SELECT \ref svm_predict('<em>model_table</em>',<em>x</em>) FROM data_table;</pre>
Instead, to make predictions on new data points stored in a table using
previously learned models, we use the function:
<pre>SELECT \ref svm_predict_batch('<em>input_table</em>', '<em>data_col</em>', '<em>id</em>', '<em>model_table</em>', '<em>output_table</em>', <em>parallel</em>);</pre>
The output_table is created during the function call; an existing table with
the same name will be dropped.
If the parallel parameter is true, then each data point in the input table will have multiple
predicted values corresponding to the number of models learned in parallel.
@implementation
Currently, three kernel functions have been implemented: dot product (\ref svm_dot), polynomial (\ref svm_polynomial) and Gaussian (\ref svm_gaussian) kernels. To use the dot product kernel function,
simply use '<tt><em>MADLIB_SCHEMA.svm_dot</em></tt>' as the <tt>kernel_func</tt> argument, which accepts any function that takes in two float[] and returns a float. To use the polynomial or Gaussian kernels,
a wrapper function is needed since these kernels require additional input parameters (see online_sv.sql_in for input parameters).
For example, to use the polynomial kernel with degree 2, first create a wrapper function:
<pre>CREATE OR REPLACE FUNCTION mykernel(FLOAT[],FLOAT[]) RETURNS FLOAT AS $$
SELECT \ref svm_polynomial($1,$2,2)
$$ language sql;</pre>
Then call the SVM learning functions with <tt>mykernel</tt> as the argument to <tt>kernel_func</tt>.
<pre>SELECT \ref svm_regression('my_schema.my_train_data', 'mymodel', false, 'mykernel');</pre>
To drop all tables pertaining to the model, we can use
<pre>SELECT \ref svm_drop_model('model_table');</pre>
@examp
As a general first step, we need to prepare and populate an input
table/view with the following structure:
\code
TABLE/VIEW my_schema.my_input_table
(
id INT, -- point ID
ind FLOAT8[], -- data point
label FLOAT8 -- label of data point
);
\endcode
Note: The label field is not required for novelty detection.
<strong>Example usage for regression</strong>:
-# We can randomly generate 1000 5-dimensional data labelled by the simple target function
\code
t(x) = if x[5] = 10 then 50 else if x[5] = -10 then 50 else 0;
\endcode
and store that in the my_schema.my_train_data table as follows:
\code
sql> select MADLIB_SCHEMA.svm_generate_reg_data('my_schema.my_train_data', 1000, 5);
\endcode
-# We can now learn a regression model and store the resultant model
under the name 'myexp'.
\code
sql> select MADLIB_SCHEMA.svm_regression('my_schema.my_train_data', 'myexp', false, 'MADLIB_SCHEMA.svm_dot');
\endcode
-# We can now start using it to predict the labels of new data points
like as follows:
\code
sql> select MADLIB_SCHEMA.svm_predict('myexp', '{1,2,4,20,10}');
sql> select MADLIB_SCHEMA.svm_predict('myexp', '{1,2,4,20,-10}');
\endcode
-# To learn multiple support vector models, we replace the learning step above by
\code
sql> select MADLIB_SCHEMA.svm_regression('my_schema.my_train_data', 'myexp', true, 'MADLIB_SCHEMA.svm_dot');
\endcode
The resultant models can be used for prediction as follows:
\code
sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexp', '{1,2,4,20,10}');
\endcode
-# We can also predict the labels of all the data points stored in a table.
For example, we can execute the following:
\code
sql> create table MADLIB_SCHEMA.svm_reg_test ( id int, ind float8[] );
sql> insert into MADLIB_SCHEMA.svm_reg_test (select id, ind from my_schema.my_train_data limit 20);
sql> select MADLIB_SCHEMA.svm_predict_batch('MADLIB_SCHEMA.svm_reg_test', 'ind', 'id', 'myexp', 'MADLIB_SCHEMA.svm_reg_output1', false);
sql> select * from MADLIB_SCHEMA.svm_reg_output1;
sql> select MADLIB_SCHEMA.svm_predict_batch('MADLIB_SCHEMA.svm_reg_test', 'ind', 'id, 'myexp', 'MADLIB_SCHEMA.svm_reg_output2', true);
sql> select * from MADLIB_SCHEMA.svm_reg_output2;
\endcode
<strong>Example usage for classification:</strong>
-# We can randomly generate 2000 5-dimensional data labelled by the simple
target function
\code
t(x) = if x[1] > 0 and x[2] < 0 then 1 else -1;
\endcode
and store that in the my_schema.my_train_data table as follows:
\code
sql> select MADLIB_SCHEMA.svm_generate_cls_data('my_schema.my_train_data', 2000, 5);
\endcode
-# We can now learn a classification model and store the resultant model
under the name 'myexpc'.
\code
sql> select MADLIB_SCHEMA.svm_classification('my_schema.my_train_data', 'myexpc', false, 'MADLIB_SCHEMA.svm_dot');
\endcode
-# We can now start using it to predict the labels of new data points
like as follows:
\code
sql> select MADLIB_SCHEMA.svm_predict('myexpc', '{10,-2,4,20,10}');
\endcode
-# To learn multiple support vector models, replace the model-building and prediction steps above by
\code
sql> select MADLIB_SCHEMA.svm_classification('my_schema.my_train_data', 'myexpc', true, 'MADLIB_SCHEMA.svm_dot');
sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexpc', '{10,-2,4,20,10}');
\endcode
<strong>Example usage for novelty detection:</strong>
-# We can randomly generate 100 2-dimensional data (the normal cases)
and store that in the my_schema.my_train_data table as follows:
\code
sql> select MADLIB_SCHEMA.svm_generate_nd_data('my_schema.my_train_data', 100, 2);
\endcode
-# Learning and predicting using a single novelty detection model can be done as follows:
\code
sql> select MADLIB_SCHEMA.svm_novelty_detection('my_schema.my_train_data', 'myexpnd', false, 'MADLIB_SCHEMA.svm_dot');
sql> select MADLIB_SCHEMA.svm_predict('myexpnd', '{10,-10}');
sql> select MADLIB_SCHEMA.svm_predict('myexpnd', '{-1,-1}');
\endcode
-# Learning and predicting using multiple models can be done as follows:
\code
sql> select MADLIB_SCHEMA.svm_novelty_detection('my_schema.my_train_data', 'myexpnd', true, 'MADLIB_SCHEMA.svm_dot');
sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexpnd', '{10,-10}');
sql> select * from MADLIB_SCHEMA.svm_predict_combo('myexpnd', '{-1,-1}');
\endcode
@literature
[1] Jyrki Kivinen, Alexander J. Smola, and Robert C. Williamson: <em>Online
Learning with Kernels</em>, IEEE Transactions on Signal Processing, 52(8),
2165-2176, 2004.
[2] Bernhard Scholkopf and Alexander J. Smola: <em>Learning with Kernels:
Support Vector Machines, Regularization, Optimization, and Beyond</em>,
MIT Press, 2002.
@sa File online_sv.sql_in documenting the SQL functions.
@internal
@sa namespace online_sv (documenting the implementation in Python)
@endinternal
*/
-- The following is the structure to record the results of a learning process.
-- We work with arrays of float8 for now; we'll extend the code to work with sparse vectors next.
--
CREATE TYPE MADLIB_SCHEMA.svm_model_rec AS (
inds int, -- number of individuals processed
cum_err float8, -- cumulative error
epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
rho float8, -- classification margin
b float8, -- classifier offset
nsvs int, -- number of support vectors
ind_dim int, -- the dimension of the individuals
weights float8[], -- the weight of the support vectors
individuals float8[], -- the array of support vectors, represented as a 1-D array
kernel_oid oid -- OID of kernel function
);
-- The following is the return type of a regression learning process
--
CREATE TYPE MADLIB_SCHEMA.svm_reg_result AS (
model_table text, -- table where the model is stored
model_name text, -- model name
inds int, -- number of individuals processed
cum_err float8, -- cumulative error
epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
b float8, -- classifier offset
nsvs int -- number of support vectors
);
-- The following is the return type of a classification learning process
--
CREATE TYPE MADLIB_SCHEMA.svm_cls_result AS (
model_table text, -- table where the model is stored
model_name text, -- model name
inds int, -- number of individuals processed
cum_err float8, -- cumulative error
rho float8, -- classification margin
b float8, -- classifier offset
nsvs int -- number of support vectors
);
-- The following is the return type of a novelty detection learning process
--
CREATE TYPE MADLIB_SCHEMA.svm_nd_result AS (
model_table text, -- table where the model is stored
model_name text, -- model name
inds int, -- number of individuals processed
rho float8, -- classification margin
nsvs int -- number of support vectors
);
-- The type for representing support vectors
--
CREATE TYPE MADLIB_SCHEMA.svm_support_vector AS ( id text, weight float8, sv float8[] );
-- Kernel functions are a generalisation of inner products.
-- They provide the means by which we can extend linear machines to work in non-linear transformed feature spaces.
-- Here are a few standard kernels: dot product, polynomial kernel, Gaussian kernel.
--
/**
* @brief Dot product kernel function
*
* @param x The data point \f$ \boldsymbol x \f$
* @param y The data point \f$ \boldsymbol y \f$
* @return Returns dot product of the two data points.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_dot(x float8[], y float8[]) RETURNS float8
AS 'MODULE_PATHNAME', 'svm_dot' LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Polynomial kernel function
*
* @param x The data point \f$ \boldsymbol x \f$
* @param y The data point \f$ \boldsymbol y \f$
* @param degree The degree \f$ d \f$
* @return Returns \f$ K(\boldsymbol x,\boldsymbol y)=(\boldsymbol x \cdot \boldsymbol y)^d \f$
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_polynomial(x float8[], y float8[], degree float8) RETURNS float8
AS 'MODULE_PATHNAME', 'svm_polynomial' LANGUAGE C IMMUTABLE STRICT;
/**
* @brief Gaussian kernel function
*
* @param x The data point \f$ \boldsymbol x \f$
* @param y The data point \f$ \boldsymbol y \f$
* @param gamma The spread \f$ \gamma \f$
* @return Returns \f$ K(\boldsymbol x,\boldsymbol y)=exp(-\gamma || \boldsymbol x \cdot \boldsymbol y ||^2 ) \f$
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_gaussian(x float8[], y float8[], gamma float8) RETURNS float8
AS 'MODULE_PATHNAME', 'svm_gaussian' LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict_sub(int,int,float8[],float8[],float8[],text) RETURNS float8
AS 'MODULE_PATHNAME', 'svm_predict_sub' LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(svs MADLIB_SCHEMA.svm_model_rec, ind float8[], kernel text)
RETURNS float8 AS $$
SELECT MADLIB_SCHEMA.svm_predict_sub($1.nsvs, $1.ind_dim, $1.weights, $1.individuals, $2, $3);
$$ LANGUAGE SQL;
-- This is the main online support vector regression learning algorithm.
-- The function updates the support vector model as it processes each new training example.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
--
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_reg_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], label FLOAT8, kernel TEXT, eta FLOAT8, nu FLOAT8, slambda FLOAT8)
RETURNS MADLIB_SCHEMA.svm_model_rec AS 'MODULE_PATHNAME', 'svm_reg_update' LANGUAGE C STRICT;
CREATE AGGREGATE MADLIB_SCHEMA.svm_reg_agg(float8[], float8, text, float8, float8, float8) (
sfunc = MADLIB_SCHEMA.svm_reg_update,
stype = MADLIB_SCHEMA.svm_model_rec,
initcond = '(0,0,0,0,0,0,0,{},{},0)'
);
-- This is the main online support vector classification learning algorithm.
-- The function updates the support vector model as it processes each new training example.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
--
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_cls_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], label FLOAT8, kernel TEXT, eta FLOAT8, nu FLOAT8)
RETURNS MADLIB_SCHEMA.svm_model_rec AS 'MODULE_PATHNAME', 'svm_cls_update' LANGUAGE C STRICT;
CREATE AGGREGATE MADLIB_SCHEMA.svm_cls_agg(float8[], float8, text, float8, float8) (
sfunc = MADLIB_SCHEMA.svm_cls_update,
stype = MADLIB_SCHEMA.svm_model_rec,
initcond = '(0,0,0,0,0,0,0,{},{},0)'
);
-- This is the main online support vector novelty detection algorithm.
-- The function updates the support vector model as it processes each new training example.
-- In contrast to classification and regression, the training data points have no labels.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
--
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_nd_update(svs MADLIB_SCHEMA.svm_model_rec, ind FLOAT8[], kernel TEXT, eta FLOAT8, nu FLOAT8)
RETURNS MADLIB_SCHEMA.svm_model_rec AS 'MODULE_PATHNAME', 'svm_nd_update' LANGUAGE C STRICT;
CREATE AGGREGATE MADLIB_SCHEMA.svm_nd_agg(float8[], text, float8, float8) (
sfunc = MADLIB_SCHEMA.svm_nd_update,
stype = MADLIB_SCHEMA.svm_model_rec,
initcond = '(0,0,0,0,0,0,0,{},{},0)'
);
-- This function stores a MADLIB_SCHEMA.svm_model_rec stored in model_temp_table into the model_table.
--
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_store_model(model_temp_table TEXT, model_name TEXT, model_table TEXT) RETURNS VOID AS $$
sql = "SELECT COUNT(*) FROM " + model_temp_table + " WHERE id = \'" + model_name + "\'";
temp = plpy.execute(sql);
if (temp[0]['count'] == 0):
plpy.error("No support vector model with name " + model_name + " found.");
sql = "SELECT (model).ind_dim, (model).nsvs" \
+ " FROM " + model_temp_table + " WHERE id = '" + model_name + "'";
rv = plpy.execute(sql);
myind_dim = rv[0]['ind_dim'];
mynsvs = rv[0]['nsvs'];
if (mynsvs == 0):
plpy.error("The specified model has no support vectors and therefore not processed");
idx = 0;
for i in range(1,mynsvs+1):
idx = myind_dim * (i-1);
sql = "INSERT INTO " + model_table \
+ " SELECT \'" + model_name + "\', (model).weights[" + str(i) + "], " \
+ " (model).individuals[(" + str(idx+1) + "):(" + str(idx) + "+" + str(myind_dim) + ")] " \
+ " FROM " + model_temp_table + " WHERE id = \'" + model_name + "\' LIMIT 1";
plpy.execute(sql);
$$ LANGUAGE plpythonu;
-- This function stores a collection of models learned in parallel into the model_table.
-- The different models are stored in model_temp_table and are assumed to be named model_table1, model_table2, ....
--
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_store_model(model_temp_table TEXT, model_table TEXT, model_name TEXT, n INT) RETURNS VOID AS $$
for i in range(0,n):
plpy.execute("select MADLIB_SCHEMA.svm_store_model('" + model_temp_table + "', '" + model_name + str(i) + "', '" + model_table + "')");
$$ LANGUAGE plpythonu;
/**
* @brief Drops all tables pertaining to a model
*
* @param model_table The table to be dropped.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_drop_model(model_table TEXT) RETURNS VOID AS $$
plpy.execute("drop table if exists " + model_table)
plpy.execute("drop table if exists " + model_table + "_param")
$$ LANGUAGE plpythonu;
CREATE TYPE MADLIB_SCHEMA.svm_model_pr AS ( model text, prediction float8 );
/**
* @brief Evaluates a support-vector model on a given data point
*
* @param model_table The table storing the learned model \f$ f \f$ to be used
* @param ind The data point \f$ \boldsymbol x \f$
* @return This function returns \f$ f(\boldsymbol x) \f$
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_predict(model_table text, ind float8[]) RETURNS FLOAT8 AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_predict(model_table, ind);
$$ LANGUAGE plpythonu;
/**
* @brief Evaluates multiple support-vector models on a data point
*
* @param model_table The table storing the learned models to be used.
* @param ind The data point \f$ \boldsymbol x \f$
* @return This function returns a table, a row for each model.
* Moreover, the last row contains the average value, over all models.
*
* The different models are assumed to be named <tt><em>model_table</em>1</tt>,
* <tt><em>model_table</em>2</tt>, ....
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_predict_combo(model_table text, ind float8[]) RETURNS SETOF MADLIB_SCHEMA.svm_model_pr AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_predict_combo( MADlibSchema, model_table, ind);
$$ LANGUAGE plpythonu;
/**
* @brief This is the support vector regression function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_regression().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_regression(input_table text, model_table text, parallel bool, kernel_func text)
RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_regression( MADlibSchema, input_table, model_table, parallel, kernel_func);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector regression function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @param slambda Regularisation parameter
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_regression().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_regression(input_table text, model_table text, parallel bool, kernel_func text, verbose bool, eta float8, nu float8, slambda float8)
RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_regression( MADlibSchema, input_table, model_table, parallel, kernel_func, verbose, eta, nu, slambda);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector classification function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_classification().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_classification(input_table text, model_table text, parallel bool, kernel_func text)
RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_classification( MADlibSchema, input_table, model_table, parallel, kernel_func);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector classification function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_classification().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_classification(input_table text, model_table text, parallel bool, kernel_func text, verbose bool, eta float8, nu float8)
RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_classification( MADlibSchema, input_table, model_table, parallel, kernel_func, verbose, eta, nu);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector novelty detection function.
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_novelty_detection().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_novelty_detection(input_table text, model_table text, parallel bool, kernel_func text)
RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_novelty_detection( MADlibSchema, input_table, model_table, parallel, kernel_func);
$$ LANGUAGE 'plpythonu';
/**
* @brief This is the support vector novelty detection function.
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_novelty_detection().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_novelty_detection(input_table text, model_table text, parallel bool, kernel_func text, verbose bool, eta float8, nu float8)
RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_novelty_detection( MADlibSchema, input_table, model_table, parallel, kernel_func, verbose, eta, nu);
$$ LANGUAGE 'plpythonu';
/**
* @brief Scores the data points stored in a table using a learned support-vector model
*
* @param input_table Name of table/view containing the data points to be scored
* @param data_col Name of column in input_table containing the data points
* @param id_col Name of column in input_table containing the integer identifier of data points
* @param model_table Name of table where the learned model to be used is stored
* @param output_table Name of table to store the results
* @param parallel A flag indicating whether the model to be used was learned in parallel
* @return Textual summary of the algorithm run
*
* @internal
* @sa This function is a wrapper for online_sv::svm_predict_batch().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_predict_batch(input_table text, data_col text, id_col text, model_table text, output_table text, parallel bool)
RETURNS TEXT
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# MADlibSchema comes from PythonFunctionBodyOnly
return online_sv.svm_predict_batch( input_table, data_col, id_col, model_table, output_table, parallel);
$$ LANGUAGE 'plpythonu';
-- Generate artificial training data
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind(d INT) RETURNS float8[] AS $$
DECLARE
ret float8[];
BEGIN
FOR i IN 1..(d-1) LOOP
ret[i] = RANDOM() * 40 - 20;
END LOOP;
IF (RANDOM() > 0.5) THEN
ret[d] = 10;
ELSE
ret[d] = -10;
END IF;
RETURN ret;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind2(d INT) RETURNS float8[] AS $$
DECLARE
ret float8[];
BEGIN
FOR i IN 1..d LOOP
ret[i] = RANDOM() * 5 + 10;
IF (RANDOM() > 0.5) THEN ret[i] = -ret[i]; END IF;
END LOOP;
RETURN ret;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_reg_data(output_table text, num int, dim int) RETURNS VOID AS $$
plpy.execute("drop table if exists " + output_table)
plpy.execute("create table " + output_table + " ( id int, ind float8[], label float8 ) m4_ifdef(`GREENPLUM', `distributed by (id)')")
plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
plpy.execute("UPDATE " + output_table + " SET label = MADLIB_SCHEMA.__svm_target_reg_func(ind)")
$$ LANGUAGE 'plpythonu';
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_reg_func(ind float8[]) RETURNS float8 AS $$
DECLARE
dim int;
BEGIN
dim = array_upper(ind,1);
IF (ind[dim] = 10) THEN RETURN 50; END IF;
RETURN -50;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_cls_data(output_table text, num int, dim int) RETURNS VOID AS $$
plpy.execute("drop table if exists " + output_table);
plpy.execute("create table " + output_table + " ( id int, ind float8[], label float8 ) m4_ifdef(`GREENPLUM', `distributed by (id)')")
plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind(" + str(dim) + "), 0 FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
plpy.execute("UPDATE " + output_table + " SET label = MADLIB_SCHEMA.__svm_target_cl_func(ind)")
$$ LANGUAGE 'plpythonu';
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_cl_func(ind float8[]) RETURNS float8 AS $$
BEGIN
IF (ind[1] > 0 AND ind[2] < 0) THEN RETURN 1; END IF;
RETURN -1;
END
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_nd_data(output_table text, num int, dim int) RETURNS VOID AS $$
plpy.execute("drop table if exists " + output_table);
plpy.execute("create table " + output_table + " ( id int, ind float8[] ) m4_ifdef(`GREENPLUM', `distributed by (id)')")
plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind2(" + str(dim) + ") FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
$$ LANGUAGE 'plpythonu';