blob: c3b672b45d674469f66b426ccdea963b9c320ee0 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file online_sv.sql_in
*
* @brief SQL functions for support vector machines
* @sa For an introduction to Support vector machines (SVMs) and related kernel
* methods, see the module description \ref grp_kernmach.
*
*//* ------------------------------------------------------------------------*/
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_kernmach
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#learn">Regression Learning Function</a></li>
<li><a href="#classify">Classification Learning Function</a></li>
<li><a href="#novelty">Novelty Detection Functions</a></li>
<li><a href="#predict">Prediction Functions</a></li>
<li><a href="#notes">Notes</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Generates support vector machines for classification, regression, and novelty detection.
\warning <em> This MADlib method is still in early stage development. There may be some
issues that will be addressed in a future version. Interface and implementation
is subject to change. </em>
Support vector machines (SVMs) and related kernel methods have been among
the most popular and well-studied machine learning techniques of the
past 15 years, with an amazing number of innovations and applications.
In a nutshell, an SVM model \f$f(x)\f$ takes the form of
\f[
f(x) = \sum_i \alpha_i k(x_i,x),
\f]
where each \f$ \alpha_i \f$ is a real number, each \f$ \boldsymbol x_i \f$ is a
data point from the training set (called a support vector), and
\f$ k(\cdot, \cdot) \f$ is a kernel function that measures how "similar" two
objects are. In regression, \f$ f(\boldsymbol x) \f$ is the regression function
we seek. In classification, \f$ f(\boldsymbol x) \f$ serves as
the decision boundary; so for example in binary classification, the predictor
can output class 1 for object \f$x\f$ if \f$ f(\boldsymbol x) \geq 0 \f$, and class
2 otherwise.
In the case when the kernel function \f$ k(\cdot, \cdot) \f$ is the standard
inner product on vectors, \f$ f(\boldsymbol x) \f$ is just an alternative way of
writing a linear function
\f[
f'(\boldsymbol x) = \langle \boldsymbol w, \boldsymbol x \rangle,
\f]
where \f$ \boldsymbol w \f$ is a weight vector having the same dimension as
\f$ \boldsymbol x \f$. One of the key points of SVMs is that we can use more
fancy kernel functions to efficiently learn linear models in high-dimensional
feature spaces, since \f$ k(\boldsymbol x_i, \boldsymbol x_j) \f$ can be
understood as an efficient way of computing an inner product in the feature
space:
\f[
k(\boldsymbol x_i, \boldsymbol x_j)
= \langle \phi(\boldsymbol x_i), \phi(\boldsymbol x_j) \rangle,
\f]
where \f$ \phi(\boldsymbol x) \f$ projects \f$ \boldsymbol x \f$ into a
(possibly infinite-dimensional) feature space.
There are many algorithms for learning kernel machines. This module
implements the class of online learning with kernels algorithms
described in Kivinen et al. [1]. It also includes the incremental gradient
descent (IGD) method Feng et al. [3] for learning linear SVMs with the Hinge
loss \f$l(z) = \max(0, 1-z)\f$. See also the book Scholkopf and Smola [2] for
more details of SVMs in general.
The IGD implementation is based on Bismarck project in University of Wisconsin
(http://hazy.cs.wisc.edu/hazy/victor/bismarck/). The methods introduced in [1]
are implemented according to their original descriptions, except that
we only update the support vector model when we make a significant
error. The original algorithms in [1] update the support vector model at
every step, even when no error was made, in the name of
regularization. For practical purposes, and this is verified
empirically to a certain degree, updating only when necessary is both
faster and better from a learning-theoretic point of view, at least in
the i.i.d. setting.
Methods for classification, regression and novelty detection are
available. Multiple instances of the algorithms can be executed
in parallel on different subsets of the training data. The resultant
support vector models can then be combined using standard techniques
like averaging or majority voting.
Training data points are accessed via a table or a view. The support
vector models can also be stored in tables for fast execution.
@anchor learn
@par Regression Learning Function
Regression learning is achieved through the following function:
<pre class="syntax">
svm_regression( input_table,
model_table,
parallel,
kernel_func,
verbose DEFAULT false,
eta DEFAULT 0.1,
nu DEFAULT 0.005,
slambda DEFAULT 0.05,
kernel_param DEFAULT 1.0
)</pre>
For classification and regression, the training table/view is expected to be of the following form (the array size of <em>ind</em> must not be greater than 102,400.):\n
<pre>{TABLE|VIEW} input_table (
...
id INT,
ind FLOAT8[],
label FLOAT8,
...
)</pre>
For novelty detection, the label field is not required.
Also note that the column names of input_table requires to be exactly the same as described above. This limitation will be removed when this module graduates from early development stage.
@anchor classify
@par Classification Learning Function
Classification learning is achieved through the following two functions:
- Learn linear SVM(s) using IGD [3].
<pre class="syntax">
lsvm_classification( input_table,
model_table,
parallel DEFAULT false,
verbose DEFAULT false,
eta DEFAULT 0.1,
reg DEFAULT 0.001,
max_iter DEFAULT 100
)
</pre>
Note that, as any gradient descent methods, IGD will converge with a greater
eta (stepsize), thus faster, if the input training data is well-conditioned.
We highly recommend user to perform data preparation, such that the mean value
of each feature column is 0 and standard error is 1, and append an extra
feature with constant value 1 for intercept term. We plan to provide a
function for this when this module graduates from early stage development.
- Learn linear or non-linear SVM(s) using the method described in [1].
<pre class="syntax">
svm_classification( input_table,
model_table,
parallel,
kernel_func,
verbose DEFAULT false,
eta DEFAULT 0.1,
nu DEFAULT 0.005,
kernel_param DEFAULT 1.0
)
</pre>
@anchor novelty
@par Novelty Detection Function
Novelty detection is achieved through the following function:
<pre class="syntax">
svm_novelty_detection( input_table,
model_table,
parallel,
kernel_func,
verbose DEFAULT false,
eta DEFAULT 0.1,
nu DEFAULT 0.005,
kernel_param DEFAULT 1.0
)
</pre>
Assuming the model_table parameter takes on value 'model', each learning function will produce two tables
as output: 'model' and 'model_param'.
The first contains the support vectors of the model(s) learned.
The second contains the parameters of the model(s) learned, which include information like the kernel function
used and the value of the intercept, if there is one.
@anchor predict
@par Prediction Functions
- To make predictions on a single data point x using a single model learned previously, we use the function
<pre class="syntax">
svm_predict_batch( input_table,
data_col,
id_col,
model_table,
output_table,
parallel
)
</pre>
If the \c parallel parameter is true, then each data point in the input table will have multiple
predicted values corresponding to the number of models learned in
- If the model is produced by the lsvm_classification() function, use
the following prediction function instead.
<pre class="syntax">
lsvm_predict_batch( input_table,
data_col,
id_col,
model_table,
output_table,
)
</pre>
- Note that, to make predictions on a subset of data points stored in a table,
a separated view or table needs to be created ahead of time:
<pre class="example">
-- create subset as a view
CREATE VIEW subset AS SELECT * FROM input_table WHERE id <= 100;
-- prediction on the subset
SELECT svm_predict_batch('subset', 'ind', 'id',
'svm_model', 'subset_svm_predict', false);
-- prediction using linear SVMs
SELECT lsvm_predict_batch('subset', 'ind', 'id',
'lsvm_model', 'subset_lsvm_predict');
</pre>
@anchor notes
@par Notes
The <tt>kernel_func</tt> argument of <tt>svm_classification</tt>,
<tt>svm_regression</tt>, and <tt>svm_novelty_detection</tt> can only accept a
kernel function in the following form:
<pre class="syntax">
FLOAT8 kernel_function(FLOAT8[], FLOAT8[], FLOAT8)
</pre>
The first two parameters are feature vectors, and the third one is a control
parameter for the kernel function. The value of the control parameter must be
set throught the <tt>kernel_param</tt> argument of <tt>svm_classification</tt>,
<tt>svm_regression</tt>, and <tt>svm_novelty_detection</tt>.
Currently, three kernel functions have been implemented: linear or dot product
(\ref svm_dot), polynomial (\ref svm_polynomial) and Gaussian (\ref
svm_gaussian) kernels. Note that for the dot product kernel, it actually only
requires two FLOAT8[] parameters. To be compliant with the requirements for the
kernel function, we have an overloaded version of <tt>svm_dot</tt> which
accepts two FLOAT8[] and one FLOAT8 and returns a FLOAT8, but the FLOAT8 parameter
is simply a placeholder and will be ignored.
With the HAWQ database, only the above pre-built kernel functions can be
used. With the Greenplum database and PostgreSQL database, one can use any
user-defined function as long as it conforms to the requirements for the kernel
function.
@anchor examples
@examp
As a general first step, prepare and populate an input table/view with the following structure:
<pre class="example">
CREATE TABLE/VIEW my_schema.my_input_table(
id INT, -- point ID
ind FLOAT8[], -- data point
label FLOAT8 -- label of data point
);
</pre>
The label field is not required for novelty detection.
<strong>Example usage for regression</strong>:
-# Randomly generate 1000 5-dimensional data labelled using the simple target function.
<pre class="example">
t(x) = if x[5] = 10 then 50 else if x[5] = -10 then 50 else 0;
</pre>
and store that in the my_schema.my_train_data table as follows:
<pre class="example">
SELECT madlib.svm_generate_reg_data(
'my_schema.my_train_data', 1000, 5);
</pre>
-# Learn a regression model and store the resultant model under the name 'myexp'.
<pre class="example">
SELECT madlib.svm_regression('my_schema.my_train_data',
'myexp1', false, 'madlib.svm_dot');
</pre>
-# To learn multiple support vector models, we replace the learning step above by
<pre class="example">
SELECT madlib.svm_regression('my_schema.my_train_data',
'myexp2', true, 'madlib.svm_dot');
</pre>
-# We can also predict the labels of data points stored in a table. For example, we can execute the following:
<pre class="example">
-- prepare test data
CREATE TABLE madlib.svm_reg_test AS
SELECT id, ind
FROM my_schema.my_train_data
LIMIT 20;
-- prediction using a single model
SELECT madlib.svm_predict_batch('madlib.svm_reg_test', 'ind', 'id',
'myexp1', 'madlib.svm_reg_output1', false);
SELECT * FROM madlib.svm_reg_output1;
-- prediction using multiple models
SELECT madlib.svm_predict_batch('madlib.svm_reg_test', 'ind', 'id',
'myexp2', 'madlib.svm_reg_output2', true);
SELECT * FROM madlib.svm_reg_output2;
</pre>
<strong>Example usage for classification:</strong>
-# Randomly generate training and testing data labelled by the simple target function.
<pre class="example">
t(x) = if x[1] > 0 and x[2] < 0 then 1 else -1;
</pre>
and store that in tables as follows:
<pre class="example">
SELECT madlib.svm_generate_cls_data(
'my_schema.my_train_data', 2000, 5);
SELECT madlib.svm_generate_cls_data(
'my_schema.my_test_data', 3000, 5);
</pre>
-# Learn a classification model and store the resultant model the table 'myexpc'.
<pre class="example">
SELECT madlib.svm_classification('my_schema.my_train_data',
'myexpc', false, 'madlib.svm_dot');
</pre>
-# Start using the model to predict the labels of testing data points.
<pre class="example">
SELECT madlib.svm_predict_batch('my_schema.my_test_data', 'ind', 'id',
'myexpc', 'my_schema.svm_cls_output1', false);
</pre>
-# To learn multiple support vector models, replace the model-building and prediction steps above.
<pre class="example">
-- training
SELECT madlib.svm_classification('my_schema.my_train_data',
'myexpc', true, 'madlib.svm_dot');
-- predicting
SELECT madlib.svm_predict_batch('my_schema.my_test_data', 'ind', 'id',
'myexpc', 'my_schema.svm_cls_output1', true);
</pre>
-# To learn a linear support vector model using IGD [3], replace the model-building and prediction steps.
<pre class="example">
-- training
SELECT madlib.lsvm_classification(
'my_schema.my_train_data', 'my_lsvm');
-- predicting
SELECT madlib.lsvm_predict_batch('my_schema.my_test_data',
'ind', 'id', 'my_lsvm', 'my_lsvm_predict');
</pre>
<strong>Example usage for novelty detection:</strong>
-# Randomly generate 100 2-dimensional data (the normal cases) and store that in the my_schema.my_train_data table.
<pre class="example">
SELECT madlib.svm_generate_nd_data(
'my_schema.my_train_data', 100, 2);
</pre>
-# Learning and predicting using a single novelty detection model:
<pre class="example">
SELECT madlib.svm_novelty_detection( 'my_schema.my_train_data',
'myexpnd',
false,
'madlib.svm_dot'
);
SELECT * FROM myexpnd;
</pre>
-# Learning and predicting using multiple models can be done as follows:
<pre class="example">
SELECT madlib.svm_novelty_detection( 'my_schema.my_train_data',
'myexpnd',
true,
'madlib.svm_dot'
);
SELECT * FROM myexpnd;
</pre>
<strong>Model cleanup:</strong>
To drop all tables pertaining to the model, use:
<pre class="example">
SELECT svm_drop_model('model_table');
</pre>
@anchor literature
@literature
[1] Jyrki Kivinen, Alexander J. Smola, and Robert C. Williamson: <em>Online
Learning with Kernels</em>, IEEE Transactions on Signal Processing, 52(8),
2165-2176, 2004.
[2] Bernhard Scholkopf and Alexander J. Smola: <em>Learning with Kernels:
Support Vector Machines, Regularization, Optimization, and Beyond</em>,
MIT Press, 2002.
[3] X. Feng, A. Kumar, B. Recht, and C. R&eacute;: <em>Towards a unified architecture
for in-RDBMS analytics</em>, In SIGMOD Conference, 2012.
@anchor related
@par Related Topics
File online_sv.sql_in and lsvm.sql_in documenting the SQL functions.
@internal
@sa namespace online_sv (documenting the implementation in Python)
@endinternal
*/
-- The following is the structure to record the results of a learning process.
-- We work with arrays of float8 for now; we'll extend the code to work with sparse vectors next.
--
DROP TYPE IF EXISTS MADLIB_SCHEMA.svm_model_rec CASCADE;
CREATE TYPE MADLIB_SCHEMA.svm_model_rec AS (
inds int, -- number of individuals processed
cum_err float8, -- cumulative error
epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
rho float8, -- classification margin
b float8, -- classifier offset
nsvs int, -- number of support vectors
ind_dim int, -- the dimension of the individuals
weights float8[], -- the weight of the support vectors
individuals float8[], -- the array of support vectors, represented as a 1-D array
kernel_oid oid -- OID of kernel function
);
-- The following is the return type of a regression learning process
--
DROP TYPE IF EXISTS MADLIB_SCHEMA.svm_reg_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.svm_reg_result AS (
model_table text, -- table where the model is stored
model_name text, -- model name
inds int, -- number of individuals processed
cum_err float8, -- cumulative error
epsilon float8, -- the size of the epsilon tube around the hyperplane, adaptively adjusted by algorithm
b float8, -- classifier offset
nsvs int -- number of support vectors
);
-- The following is the return type of a classification learning process
--
DROP TYPE IF EXISTS MADLIB_SCHEMA.svm_cls_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.svm_cls_result AS (
model_table text, -- table where the model is stored
model_name text, -- model name
inds int, -- number of individuals processed
cum_err float8, -- cumulative error
rho float8, -- classification margin
b float8, -- classifier offset
nsvs int -- number of support vectors
);
-- The following is the return type of a novelty detection learning process
--
DROP TYPE IF EXISTS MADLIB_SCHEMA.svm_nd_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.svm_nd_result AS (
model_table text, -- table where the model is stored
model_name text, -- model name
inds int, -- number of individuals processed
rho float8, -- classification margin
nsvs int -- number of support vectors
);
-- The type for representing support vectors
--
DROP TYPE IF EXISTS MADLIB_SCHEMA.svm_support_vector CASCADE;
CREATE TYPE MADLIB_SCHEMA.svm_support_vector AS ( id text, weight float8, sv float8[] );
-- Kernel functions are a generalisation of inner products.
-- They provide the means by which we can extend linear machines to work in non-linear transformed feature spaces.
-- Here are a few standard kernels: dot product, polynomial kernel, Gaussian kernel.
--
/**
* @brief Dot product kernel function
*
* @param x The data point \f$ \boldsymbol x \f$
* @param y The data point \f$ \boldsymbol y \f$
* @return Returns dot product of the two data points.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_dot(x float8[], y float8[]) RETURNS float8
AS 'MODULE_PATHNAME', 'array_dot' LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_dot(x float8[], y float8[], placeholder float8) RETURNS float8
AS 'MODULE_PATHNAME', 'array_dot' LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
* @brief Polynomial kernel function
*
* @param x The data point \f$ \boldsymbol x \f$
* @param y The data point \f$ \boldsymbol y \f$
* @param degree The degree \f$ d \f$
* @return Returns \f$ K(\boldsymbol x,\boldsymbol y)=(\boldsymbol x \cdot \boldsymbol y)^d \f$
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_polynomial(x float8[], y float8[], degree float8) RETURNS float8
AS 'MODULE_PATHNAME', 'svm_polynomial' LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
* @brief Gaussian kernel function
*
* @param x The data point \f$ \boldsymbol x \f$
* @param y The data point \f$ \boldsymbol y \f$
* @param gamma The spread \f$ \gamma \f$
* @return Returns \f$ K(\boldsymbol x,\boldsymbol y)=exp(-\gamma || \boldsymbol x \cdot \boldsymbol y ||^2 ) \f$
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_gaussian(x float8[], y float8[], gamma float8) RETURNS float8
AS 'MODULE_PATHNAME', 'svm_gaussian' LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict_sub(
int, int, float8[], float8[], float8[], text, float8
) RETURNS float8 AS
'MODULE_PATHNAME', 'svm_predict_sub'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(
svs MADLIB_SCHEMA.svm_model_rec,
ind float8[],
kernel text,
kernel_param float8
) RETURNS float8 AS $$
SELECT MADLIB_SCHEMA.svm_predict_sub($1.nsvs, $1.ind_dim, $1.weights, $1.individuals, $2, $3, $4);
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
-- This is the main online support vector regression learning algorithm.
-- The function updates the support vector model as it processes each new training example.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
--
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_reg_update(
svs MADLIB_SCHEMA.svm_model_rec,
ind FLOAT8[],
label FLOAT8,
kernel TEXT,
eta FLOAT8,
nu FLOAT8,
slambda FLOAT8,
kernel_param FLOAT8
) RETURNS MADLIB_SCHEMA.svm_model_rec AS
'MODULE_PATHNAME', 'svm_reg_update'
LANGUAGE C STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svm_reg_agg(
float8[], float8, text, float8, float8, float8, float8
);
CREATE AGGREGATE MADLIB_SCHEMA.svm_reg_agg(
float8[], float8, text, float8, float8, float8, float8
) (
sfunc = MADLIB_SCHEMA.svm_reg_update,
stype = MADLIB_SCHEMA.svm_model_rec,
initcond = '(0,0,0,0,0,0,0,{},{},0)'
);
-- This is the main online support vector classification learning algorithm.
-- The function updates the support vector model as it processes each new training example.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
--
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_cls_update(
svs MADLIB_SCHEMA.svm_model_rec,
ind FLOAT8[],
label FLOAT8,
kernel TEXT,
eta FLOAT8,
nu FLOAT8,
kernel_param FLOAT8
) RETURNS MADLIB_SCHEMA.svm_model_rec AS
'MODULE_PATHNAME', 'svm_cls_update'
LANGUAGE C STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svm_cls_agg(
float8[], float8, text, float8, float8, float8);
CREATE AGGREGATE MADLIB_SCHEMA.svm_cls_agg(
float8[], float8, text, float8, float8, float8
) (
sfunc = MADLIB_SCHEMA.svm_cls_update,
stype = MADLIB_SCHEMA.svm_model_rec,
initcond = '(0,0,0,0,0,0,0,{},{},0)'
);
-- This is the main online support vector novelty detection algorithm.
-- The function updates the support vector model as it processes each new training example.
-- In contrast to classification and regression, the training data points have no labels.
-- This function is wrapped in an aggregate function to process all the training examples stored in a table.
--
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_nd_update(
svs MADLIB_SCHEMA.svm_model_rec,
ind FLOAT8[],
kernel TEXT,
eta FLOAT8,
nu FLOAT8,
kernel_param FLOAT8)
RETURNS MADLIB_SCHEMA.svm_model_rec AS
'MODULE_PATHNAME', 'svm_nd_update'
LANGUAGE C STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.svm_nd_agg(
float8[], text, float8, float8, float8);
CREATE AGGREGATE MADLIB_SCHEMA.svm_nd_agg(
float8[], text, float8, float8, float8
) (
sfunc = MADLIB_SCHEMA.svm_nd_update,
stype = MADLIB_SCHEMA.svm_model_rec,
initcond = '(0,0,0,0,0,0,0,{},{},0)'
);
-- This function stores a MADLIB_SCHEMA.svm_model_rec stored in model_temp_table into the model_table.
--
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_store_model(
model_temp_table TEXT,
model_name TEXT,
model_table TEXT
) RETURNS VOID AS $$
sql = "SELECT COUNT(*) FROM " + model_temp_table + " WHERE id = \'" + model_name + "\'";
temp = plpy.execute(sql);
if (temp[0]['count'] == 0):
plpy.error("No support vector model with name " + model_name + " found.");
sql = "SELECT (model).ind_dim, (model).nsvs" \
+ " FROM " + model_temp_table + " WHERE id = '" + model_name + "'";
rv = plpy.execute(sql);
myind_dim = rv[0]['ind_dim'];
mynsvs = rv[0]['nsvs'];
if (mynsvs == 0):
plpy.error("The specified model has no support vectors and therefore not processed");
idx = 0;
for i in range(1,mynsvs+1):
idx = myind_dim * (i-1);
sql = "INSERT INTO " + model_table \
+ " SELECT \'" + model_name + "\', (model).weights[" + str(i) + "], " \
+ " (model).individuals[(" + str(idx+1) + "):(" + str(idx) + "+" + str(myind_dim) + ")] " \
+ " FROM " + model_temp_table + " WHERE id = \'" + model_name + "\' LIMIT 1";
plpy.execute(sql);
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Drops all tables pertaining to a model
*
* @param model_table The table to be dropped.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_drop_model(model_table TEXT) RETURNS VOID AS $$
plpy.execute("drop table if exists " + model_table)
plpy.execute("drop table if exists " + model_table + "_param")
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
DROP TYPE IF EXISTS MADLIB_SCHEMA.svm_model_pr CASCADE;
CREATE TYPE MADLIB_SCHEMA.svm_model_pr AS ( model text, prediction float8 );
/**
* @brief This is the support vector regression function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_regression().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_regression(input_table text, model_table text, parallel bool, kernel_func text)
RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# schema_madlib comes from PythonFunctionBodyOnly
return online_sv.svm_regression( schema_madlib, input_table, model_table, parallel, kernel_func);
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief This is the support vector regression function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @param slambda Regularisation parameter
* @param kernel_param Default parameter for kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_regression().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_regression(
input_table text,
model_table text,
parallel bool,
kernel_func text,
verbose bool,
eta float8,
nu float8,
slambda float8,
kernel_param float8)
RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# schema_madlib comes from PythonFunctionBodyOnly
return online_sv.svm_regression(schema_madlib, input_table, model_table,
parallel, kernel_func, verbose, eta, nu,
slambda, kernel_param);
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief This is the support vector regression function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @param slambda Regularisation parameter
* @param kernel_param Default parameter for kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_regression().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_regression(
input_table text,
model_table text,
parallel bool,
kernel_func text,
verbose bool,
eta float8,
nu float8,
slambda float8
)RETURNS SETOF MADLIB_SCHEMA.svm_reg_result
AS $$
SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, 1.0)
$$ LANGUAGE 'sql'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief This is the support vector classification function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_classification().
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
input_table TEXT,
model_table TEXT,
parallel BOOL,
kernel_func TEXT,
kernel_param DOUBLE PRECISION
) RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# schema_madlib comes from PythonFunctionBodyOnly
return online_sv.svm_classification(schema_madlib, input_table,
model_table, parallel, kernel_func,
kernel_param=kernel_param);
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
input_table TEXT,
model_table TEXT,
parallel BOOL,
kernel_func TEXT
) RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
AS $$
SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, 1.0);
$$ LANGUAGE sql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
----------------------------------------------------------------------
/**
* @brief This is the support vector classification function
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @param kernel_param Default parameter for kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_classification().
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
input_table TEXT,
model_table TEXT,
parallel BOOL,
kernel_func TEXT,
verbose BOOL,
eta FLOAT8,
nu FLOAT8,
kernel_param DOUBLE PRECISION
) RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# schema_madlib comes from PythonFunctionBodyOnly
return online_sv.svm_classification(schema_madlib, input_table, model_table, parallel,
kernel_func, verbose, eta, nu, kernel_param);
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
--
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
input_table TEXT,
model_table TEXT,
parallel BOOL,
kernel_func TEXT,
verbose BOOL,
eta FLOAT8,
nu FLOAT8
) RETURNS SETOF MADLIB_SCHEMA.svm_cls_result
AS $$
SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, 1.0);
$$ LANGUAGE sql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
----------------------------------------------------------------------
/**
* @brief This is the support vector novelty detection function.
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_novelty_detection().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_novelty_detection(input_table text, model_table text, parallel bool, kernel_func text)
RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# schema_madlib comes from PythonFunctionBodyOnly
return online_sv.svm_novelty_detection( schema_madlib, input_table, model_table, parallel, kernel_func);
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief This is the support vector novelty detection function.
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @param kernel_param Default parameter for kernel function
* @param kernel_param
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_novelty_detection().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_novelty_detection(
input_table text,
model_table text,
parallel bool,
kernel_func text,
verbose bool,
eta float8,
nu float8,
kernel_param float8
) RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# schema_madlib comes from PythonFunctionBodyOnly
return online_sv.svm_novelty_detection(schema_madlib, input_table,
model_table, parallel,
kernel_func, verbose, eta, nu,
kernel_param);
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief This is the support vector novelty detection function.
*
* @param input_table The name of the table/view with the training data
* @param model_table The name of the table under which we want to store the learned model
* @param parallel A flag indicating whether the system should learn multiple models in parallel
* @param kernel_func Kernel function
* @param verbose Verbosity of reporting
* @param eta Learning rate in (0,1]
* @param nu Compression parameter in (0,1] associated with the fraction of training data that will become support vectors
* @param kernel_param Default parameter for kernel function
* @param kernel_param
* @return A summary of the learning process
*
* @internal
* @sa This function is a wrapper for online_sv::svm_novelty_detection().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_novelty_detection(
input_table text,
model_table text,
parallel bool,
kernel_func text,
verbose bool,
eta float8,
nu float8
) RETURNS SETOF MADLIB_SCHEMA.svm_nd_result
AS $$
SELECT MADLIB_SCHEMA.svm_novelty_detection($1, $2, $3, $4, $5, $6, $7, 1.0)
$$ LANGUAGE 'sql'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Scores the data points stored in a table using a learned support-vector model
*
* @param input_table Name of table/view containing the data points to be scored
* @param data_col Name of column in input_table containing the data points
* @param id_col Name of column in input_table containing the integer identifier of data points
* @param model_table Name of table where the learned model to be used is stored
* @param output_table Name of table to store the results
* @param parallel A flag indicating whether the model to be used was learned in parallel
* @return Textual summary of the algorithm run
*
* @internal
* @sa This function is a wrapper for online_sv::svm_predict_batch().
*/
CREATE OR REPLACE FUNCTION
MADLIB_SCHEMA.svm_predict_batch(
input_table text,
data_col text,
id_col text,
model_table text,
output_table text,
parallel bool)
RETURNS TEXT
AS $$
PythonFunctionBodyOnly(`kernel_machines', `online_sv')
# schema_madlib comes from PythonFunctionBodyOnly
return online_sv.svm_predict_batch( input_table, data_col, id_col, model_table, output_table, parallel);
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-- Generate artificial training data
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind(d INT) RETURNS float8[] AS $$
DECLARE
ret float8[];
BEGIN
FOR i IN 1..(d-1) LOOP
ret[i] = RANDOM() * 40 - 20;
END LOOP;
IF (RANDOM() > 0.5) THEN
ret[d] = 10;
ELSE
ret[d] = -10;
END IF;
RETURN ret;
END
$$ LANGUAGE plpgsql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_random_ind2(d INT) RETURNS float8[] AS $$
DECLARE
ret float8[];
BEGIN
FOR i IN 1..d LOOP
ret[i] = RANDOM() * 5 + 10;
IF (RANDOM() > 0.5) THEN ret[i] = -ret[i]; END IF;
END LOOP;
RETURN ret;
END
$$ LANGUAGE plpgsql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_reg_data(output_table text, num int, dim int) RETURNS VOID AS $$
temp_table = "madlib_temp_" + output_table
plpy.execute("DROP TABLE IF EXISTS " + output_table)
plpy.execute("DROP TABLE IF EXISTS " + temp_table)
plpy.execute("""
CREATE TABLE {temp_table} AS
SELECT
subq.val AS id,
MADLIB_SCHEMA.__svm_random_ind({dim}) AS ind
FROM
(SELECT generate_series(1, {num}) AS val) subq
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')"""
.format(temp_table=temp_table, dim=str(dim), num=str(num)))
plpy.execute("""
CREATE TABLE {output_table} AS
SELECT id, ind, MADLIB_SCHEMA.__svm_target_reg_func(ind) AS label
FROM {temp_table}
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')"""
.format(output_table=output_table, temp_table=temp_table))
plpy.execute("drop table madlib_temp_" + output_table)
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_reg_func(ind float8[]) RETURNS float8 AS $$
DECLARE
dim int;
BEGIN
dim = array_upper(ind,1);
IF (ind[dim] = 10) THEN RETURN 50; END IF;
RETURN -50;
END
$$ LANGUAGE plpgsql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_cls_data(output_table text, num int, dim int) RETURNS VOID AS $$
temp_table = "madlib_temp_" + output_table
plpy.execute("DROP TABLE IF EXISTS " + output_table)
plpy.execute("DROP TABLE IF EXISTS " + temp_table)
plpy.execute("""
CREATE TABLE {temp_table} AS
SELECT
subq.val AS id,
MADLIB_SCHEMA.__svm_random_ind({dim}) AS ind
FROM
(SELECT generate_series(1, {num}) AS val) subq
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')"""
.format(temp_table=temp_table, dim=str(dim), num=str(num)))
plpy.execute("""
CREATE TABLE {output_table} AS
SELECT id, ind, MADLIB_SCHEMA.__svm_target_cl_func(ind) AS label
FROM {temp_table}
m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')"""
.format(output_table=output_table, temp_table=temp_table))
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__svm_target_cl_func(ind float8[]) RETURNS float8 AS $$
BEGIN
IF (ind[1] > 0 AND ind[2] < 0) THEN RETURN 1; END IF;
RETURN -1;
END
$$ LANGUAGE plpgsql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_generate_nd_data(output_table text, num int, dim int) RETURNS VOID AS $$
plpy.execute("drop table if exists " + output_table);
plpy.execute("create table " + output_table + " ( id int, ind float8[] ) m4_ifdef(`__POSTGRESQL__', `', `distributed by (id)')")
plpy.execute("INSERT INTO " + output_table + " SELECT a.val, MADLIB_SCHEMA.__svm_random_ind2(" + str(dim) + ") FROM (SELECT generate_series(1," + str(num) + ") AS val) AS a")
$$ LANGUAGE 'plpythonu'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Normalizes the data stored in a table, and save the normalized data in a new table.
*
* @param input_table Name of table/view containing the data points to be scored
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_data_normalization(input_table TEXT) RETURNS VOID AS $$
output_table = input_table + "_scaled"
plpy.execute("DROP TABLE IF EXISTS " + output_table)
plpy.execute("CREATE TABLE " + output_table + " ( id int, ind float8[], label int ) m4_ifdef(`__POSTGRESQL__', `', `distributed by (id)')")
plpy.execute("INSERT INTO " + output_table + " SELECT id, MADLIB_SCHEMA.svm_normalization(ind), label FROM " + input_table)
plpy.info("output table: %s" % output_table)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');