/* ----------------------------------------------------------------------- */
/**
*
* @file svm.sql_in
*
* @brief SQL functions for support vector machines
* @date July 2015
*
* @sa For a brief introduction to support vector machines, see the
* module description \ref grp_svm.
*
*/
/* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_svm
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#svm_classification">Classification Function</a></li>
<li class="level1"><a href="#svm_regression">Regression Function</a></li>
<li class="level1"><a href="#novelty_detection">Novelty Detection</a></li>
<li class="level1"><a href="#kernel_params">Kernel Parameters</a></li>
<li class="level1"><a href="#parameters">Other Parameters</a></li>
<li class="level1"><a href="#predict">Prediction Functions</a></li>
<li class="level1"><a href="#example">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
Support vector machines are models for regression and classification
tasks. SVM models have two particularly desirable features: robustness in the
presence of noisy data and applicability to a variety of data configurations. At
its core, a <em>linear</em> SVM model is a hyperplane separating two
distinct classes of data (in the case of classification problems), in such a way
that the distance between the hyperplane and the nearest training data point
(called the <em>margin</em>) is maximized. Vectors that lie on this margin are
called support vectors. With the support vectors fixed, perturbations of vectors
beyond the margin will not affect the model; this contributes to the model’s
robustness. By substituting a kernel function for the usual inner product, one can
approximate a large variety of decision boundaries in addition to linear hyperplanes.
@brief Solves classification and regression problems by separating data with
a hyperplane or other nonlinear decision boundary.
@anchor svm_classification
@par Classification Training Function
The SVM binary classification training function has the following format:
<pre class="syntax">
svm_classification(
source_table,
model_table,
dependent_varname,
independent_varname,
kernel_func,
kernel_params,
grouping_col,
params,
verbose
)
</pre>
\b Arguments
<DL class="arglist">
<DT>source_table</DT>
<DD>TEXT. Name of the table containing the training data.</DD>
<DT>model_table</DT>
<DD>TEXT. Name of the output table containing the model. Details of the output
tables are provided below.
</DD>
<DT>dependent_varname</DT>
<DD> TEXT. Name of the dependent variable column. For classification, this
column can contain values of any type, but must assume exactly two distinct
values since only binary classification is currently supported.
</DD>
<DT>independent_varname</DT>
<DD>TEXT. Expression list to evaluate for the
independent variables. An intercept variable should not be included as part
of this expression; see 'fit_intercept' in the kernel parameters for details
on intercepts. Note that the expression must be castable to DOUBLE PRECISION[].
</DD>
<DT>kernel_func (optional)</DT>
<DD>TEXT, default: 'linear'.
Type of kernel. Currently three kernel types are supported: 'linear',
'gaussian', and 'polynomial'. The text can be any prefix of one of these three
strings; e.g., kernel_func='ga' will create a Gaussian kernel.
</DD>
<DT>kernel_params (optional)</DT>
<DD>TEXT, default: NULL.
Parameters for non-linear kernel in a comma-separated string of key-value pairs.
The actual parameters differ depending on the value of \e kernel_func.
See the description below for details.
</DD>
<DT>grouping_col (optional)</DT>
<DD>TEXT, default: NULL. An expression list used to group
the input dataset into discrete groups, which results in running one model per group.
Similar to the SQL "GROUP BY" clause. When this value is NULL, no
grouping is used and a single model is generated. Please note that
cross validation is not supported if grouping is used.</DD>
<DT>params (optional)</DT>
<DD>TEXT, default: NULL.
Parameters for optimization and regularization in a comma-separated string
of key-value pairs. If a list of values is provided, then cross-validation
will be performed to select the \e best value from the list. See the
description below for details.
</DD>
<DT>verbose (optional)</DT>
<DD>BOOLEAN, default: FALSE.
Verbose output of the results of training.</DD>
</DL>
<b>Output tables</b>
<br>
The model table produced by SVM contains the following columns:
<table class="output">
<tr>
<th>coef</th>
<td>FLOAT8. Vector of coefficients.</td>
</tr>
<tr>
<th>grouping_key</th>
<td>TEXT. Identifies the group to which the datum belongs.</td>
</tr>
<tr>
<th>num_rows_processed</th>
<td>BIGINT. Number of rows processed.</td>
</tr>
<tr>
<th>num_rows_skipped</th>
<td>BIGINT. Number of rows skipped due to missing values or failures.</td>
</tr>
<tr>
<th>num_iterations</th>
<td>INTEGER. Number of iterations completed by the stochastic gradient descent
algorithm. The algorithm either converged within this number of iterations
or hit the maximum number specified in the optimization parameters.</td>
</tr>
<tr>
<th>loss</th>
<td>FLOAT8. Value of the objective function of SVM, expressed as an
average loss per row over the \e source_table. See Technical Background
section below for more details.</td>
</tr>
<tr>
<th>norm_of_gradient</th>
<td>FLOAT8. Value of the L2-norm of the (sub)-gradient of the objective function.</td>
</tr>
<tr>
<th>__dep_var_mapping</th>
<td>TEXT[]. Vector of dependent variable labels. The first entry
corresponds to -1 and the second to +1. For internal use only.</td>
</tr>
</table>
An auxiliary table named \<model_table\>_random is created if the kernel is
not linear. It contains data needed to embed test data into the random feature
space (see references [2,3]). This data is used internally by svm_predict and
is not meaningful on its own, so you can ignore it.
A summary table named \<model_table\>_summary is also created, which has the following columns:
<table class="output">
<tr>
<th>method</th>
<td>'svm'</td>
</tr>
<tr>
<th>version_number</th>
<td>Version of MADlib which was used to generate the model.</td>
</tr>
<tr>
<th>source_table</th>
<td>The data source table name.</td>
</tr>
<tr>
<th>model_table</th>
<td>The model table name.</td>
</tr>
<tr>
<th>dependent_varname</th>
<td>The dependent variable.</td>
</tr>
<tr>
<th>independent_varname</th>
<td>The independent variables.</td>
</tr>
<tr>
<th>kernel_func</th>
<td>The kernel function.</td>
</tr>
<tr>
<th>kernel_parameters</th>
<td>The kernel parameters, as well as random feature map data.</td>
</tr>
<tr>
<th>grouping_col</th>
<td>Columns on which to group.</td>
</tr>
<tr>
<th>optim_params</th>
<td>A string containing the optimization parameters.</td>
</tr>
<tr>
<th>reg_params</th>
<td>A string containing the regularization parameters.</td>
</tr>
<tr>
<th>num_all_groups</th>
<td>Number of groups in SVM training.</td>
</tr>
<tr>
<th>num_failed_groups</th>
<td>Number of failed groups in SVM training.</td>
</tr>
<tr>
<th>total_rows_processed</th>
<td>Total number of rows processed in all groups.</td>
</tr>
<tr>
<th>total_rows_skipped</th>
<td>Total number of rows skipped in all groups due to missing
values or failures.</td>
</tr>
</table>
If cross validation is used, a table is created with a
user specified name having the following columns:
<table class="output">
<tr>
<th>...</th>
<td>Names of cross validation parameters</td>
</tr>
<tr>
<th>mean_score</th>
<td>Mean value of accuracy when predicted on the
validation fold, averaged over all folds and all rows.</td>
</tr>
<tr>
<th>std_dev_score</th>
<td>Standard deviation of accuracy when predicted on the
validation fold, averaged over all folds and all rows.</td>
</tr>
</table>
@anchor svm_regression
@par Regression Training Function
The SVM regression training function has the following format:
<pre class="syntax">
svm_regression(
    source_table,
    model_table,
    dependent_varname,
    independent_varname,
    kernel_func,
    kernel_params,
    grouping_col,
    params,
    verbose
)
</pre>
\b Arguments
Specifications for regression are largely the same as for classification. In the
model table, there is no dependent variable mapping. The following
arguments have specifications which differ from svm_classification:
<DL class="arglist">
<DT>dependent_varname</DT>
<DD>TEXT. Name of the dependent variable column. For regression, this column
can contain only values or expressions that can be cast to DOUBLE PRECISION.
Otherwise, an error will be thrown.
</DD>
<DT>params (optional)</DT>
<DD>TEXT, default: NULL.
The parameters \e epsilon and \e eps_table are only meaningful for regression.
See description below for more details.
</DD>
</DL>
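As an illustration, below is a minimal sketch of a regression call that sets
\e epsilon explicitly. It assumes the abalone table defined in the Examples
section; the model table name and epsilon value are hypothetical:
<pre class="example">
SELECT madlib.svm_regression('abalone',
                             'abalone_svm_eps',   -- hypothetical model table
                             'rings',
                             'ARRAY[1, length, diameter, height]',
                             NULL,                -- default (linear) kernel
                             NULL,
                             NULL,
                             'epsilon=0.05, max_iter=200');
</pre>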
@anchor novelty_detection
@par Novelty Detection Training Function
The novelty detection function is a one-class SVM classifier, and has the following format:
<pre class="syntax">
svm_one_class(
source_table,
model_table,
independent_varname,
kernel_func,
kernel_params,
grouping_col,
params,
verbose
)
</pre>
\b Arguments
Specifications for novelty detection are largely the same as for classification,
except the dependent variable name is not specified. The model table is the same
as that for classification.
@anchor kernel_params
@par Kernel Parameters
Kernel parameters are supplied in a string containing a comma-delimited
list of name-value pairs. All of these named parameters are optional, and
their order does not matter. You must use the format "<param_name> = <value>"
to specify the value of a parameter; otherwise, the parameter is ignored.
<DL class="arglist">
<DT><i>Parameters common to all kernels</i></dt><dd></dd>
<DT>fit_intercept</dt>
<DD>Default: True. When True, an intercept is added to the
\e independent_varname array expression. The intercept is appended
to the end of the feature list; thus the last element of the coefficient list
is the intercept.
</DD>
<DT>n_components</DT>
<DD>Default: max(100, 2*num_features). The dimensionality of the transformed feature space.
A larger value lowers the variance of the estimate of the kernel but requires
more memory and takes longer to train.</DD>
@note
Setting the \e n_components kernel parameter properly is important to
generate an accurate decision boundary and can make the difference between a
good model and a useless model. Try increasing the value of \e n_components
if you are not getting an accurate decision boundary. This parameter arises
from using the primal formulation, in which we map data into a relatively
low-dimensional randomized feature space [2, 3]. The parameter
\e n_components is the dimension of that feature space. MADlib uses the primal
formulation to support scaling to large data sets; R and other single-node
implementations use the dual formulation and hence do not need this type of
mapping, since the dimensionality of the transformed feature space in the dual
is effectively infinite.
<DT>random_state</DT>
<DD>Default: 1. Seed used by a random number generator. </DD>
</DL>
<DL class="arglist">
<DT><i>Parameters for 'gaussian' kernel</i></dt><dd></dd>
<DT>gamma</dt>
<DD> Default: 1/num_features. The parameter \f$\gamma\f$ in the Radial Basis Function
kernel, i.e., \f$\exp(-\gamma||x-y||^2)\f$. Choosing a proper value for \e gamma
is critical to the performance of a kernel machine; e.g., a large \e gamma
tends to cause overfitting, while a small \e gamma makes the model too
constrained to capture the complexity of the data.
</DD>
</DL>
<DL class="arglist">
<DT><i>Parameters for 'polynomial' kernel</i></dt><dd></dd>
<DT>coef0</dt>
<DD>Default: 1.0. The independent term \f$q\f$ in \f$ (\langle x,y\rangle + q)^r \f$.
Must be larger than or equal to 0. When it is 0, the polynomial kernel is in homogeneous form.
</DD>
<DT>degree</dt>
<DD>Default: 3. The parameter \f$r\f$ in \f$ (\langle x,y\rangle + q)^r \f$.
</DD>
</DL>
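For illustration, complete \e kernel_params strings might look like the
following (the values shown are arbitrary, not recommendations):
<pre class="example">
-- Gaussian kernel:
--   'gamma=0.5, n_components=100, random_state=1'
-- Polynomial kernel:
--   'degree=2, coef0=1.0, n_components=100'
</pre>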
@anchor parameters
@par Other Parameters
Parameters in this section are supplied in the \e params argument as a string
containing a comma-delimited list of name-value pairs. All of these named
parameters are optional, and their order does not matter. You must use the
format "<param_name> = <value>" to specify the value of a parameter; otherwise,
the parameter is ignored.
Hyperparameter optimization can be carried out using the built-in cross
validation mechanism, which is activated by assigning a value greater than 1 to
the parameter \e n_folds in \e params.
Please note that cross validation is not
supported if grouping is used.
The values of a parameter to cross validate should be provided in a list. For
example, to regularize with the L1 norm and select a lambda value from
the set {0.3, 0.4, 0.5}, one would include 'lambda={0.3, 0.4, 0.5}, norm=L1,
n_folds=10' in \e params. Note that both '{}' and '[]' are valid list
delimiters here.
@note
Not all of the parameters below can be cross-validated. For
parameters where cross validation is allowed, default values are presented
in list format, e.g., [0.01].
<pre class="syntax">
'init_stepsize = &lt;value>,
decay_factor = &lt;value>,
max_iter = &lt;value>,
tolerance = &lt;value>,
lambda = &lt;value>,
norm = &lt;value>,
epsilon = &lt;value>,
eps_table = &lt;value>,
validation_result = &lt;value>,
n_folds = &lt;value>,
class_weight = &lt;value>'
</pre>
\b Parameters
<DL class="arglist">
<DT>init_stepsize</dt>
<DD>Default: [0.01].
Also known as the initial learning rate. A small value is usually desirable to
ensure convergence, while a large value provides more room for progress during
training. Since the best value depends on the condition number of the data, in
practice one often searches in an exponential grid using built-in cross
validation; e.g., "init_stepsize = [1, 0.1, 0.001]". To reduce training time, it
is common to run cross validation on a subsampled dataset, since this usually
provides a good estimate of the condition number of the whole dataset. The
selected \e init_stepsize can then be used to train on the whole dataset.
</DD>
<DT>decay_factor</DT>
<DD>Default: [0.9]. Controls the learning rate schedule: 0 means constant rate;
-1 means inverse scaling, i.e., stepsize = init_stepsize / iteration; > 0 means
exponential decay, i.e., stepsize = init_stepsize * decay_factor^iteration.
</DD>
<DT>max_iter</dt>
<DD>Default: [100]. The maximum number of iterations allowed.
</DD>
<DT>tolerance</dt>
<DD>Default: 1e-10. The stopping criterion. Training stops when
the difference between the models of two consecutive iterations is
smaller than \e tolerance, or when the iteration count exceeds \e max_iter.
</DD>
<DT>lambda</dt>
<DD>Default: [0.01]. Regularization parameter. Must be non-negative.
</DD>
<DT>norm</dt>
<DD>Default: 'L2'. Name of the regularization, either 'L2' or 'L1'.
</DD>
<DT>epsilon</dt>
<DD>Default: [0.01].
Determines the \f$\epsilon\f$ for \f$\epsilon\f$-regression. Ignored during classification.
When training the model, differences of less than \f$\epsilon\f$ between estimated labels
and actual labels are ignored. A larger \f$\epsilon\f$ will yield a model
with fewer support vectors, but will not generalize as well to future data.
Generally, it has been suggested that epsilon should increase with noisier
data, and decrease with the number of samples. See [5].
</DD>
<DT>eps_table</dt>
<DD>Default: NULL.
Name of an input table that contains epsilon values for different groups.
Ignored when \e grouping_col is NULL. Define this input table if you want
different epsilon values for different groups. The table consists of a column
named \e epsilon, which specifies the epsilon values, and one or more columns for
\e grouping_col. Extra groups are ignored, and groups not present in this table
use the epsilon value specified in the \e epsilon parameter. A sketch of such
a table appears after this list.
</DD>
<DT>validation_result</dt>
<DD>Default: NULL.
Name of the table to store the cross validation scores.
This table is only created if the name is not NULL.
The cross validation scores are the mean and standard deviation
of the accuracy when predicted on the validation fold,
averaged over all folds and all rows. For classification, the accuracy
metric used is the ratio of correct classifications. For regression, the
accuracy metric used is the negative of mean squared error (negative to
make it a concave problem, thus selecting \e max means the highest accuracy).
</DD>
<DT>n_folds</dt>
<DD>Default: 0.
Number of folds (k). Must be at least 2 to activate cross validation.
Each fold is then used as a validation set once, while the other
k - 1 folds form the training set.
</DD>
<DT>class_weight</dt>
<DD>Default: NULL for classification, 'balanced' for one-class novelty
detection; this parameter is not applicable to regression.
Sets the weight for the classes. If not given (empty/NULL), all classes are
given equal weight. If 'class_weight = balanced', values of y are automatically
weighted inversely proportional to class frequencies in the input data, i.e.,
the weights are set as n_samples / (2 * bincount(y)).
Alternatively, 'class_weight' can be a mapping giving the weight for each class.
E.g., for dependent variable values 'a' and 'b', the 'class_weight' might be
{a: 1, b: 3}. This gives three times the weight to observations with class value
'b' compared to 'a'. (In the SVM algorithm, this means observations
with class value 'b' contribute 3x as much to learning in the stochastic
gradient step as those with value 'a'.)
For regression, the class weights are always one.
</DD>
</DL>
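As an illustration of \e eps_table, below is a minimal sketch assuming the
abalone table from the Examples section and a regression grouped by its
\e sex column; the epsilon values and table names are hypothetical:
<pre class="example">
-- One epsilon per value of the grouping column 'sex'; groups not listed
-- here fall back to the value of the 'epsilon' parameter.
DROP TABLE IF EXISTS abalone_eps;
CREATE TABLE abalone_eps (sex TEXT, epsilon FLOAT);
INSERT INTO abalone_eps VALUES ('M', 0.01), ('F', 0.05), ('I', 0.1);
SELECT madlib.svm_regression('abalone',
                             'abalone_svm_grp',   -- hypothetical model table
                             'rings',
                             'ARRAY[1, length, diameter, height]',
                             NULL,                -- default (linear) kernel
                             NULL,
                             'sex',               -- grouping column
                             'eps_table=abalone_eps');
</pre>
Similarly, a \e class_weight mapping is passed inside \e params; e.g., for a
Boolean dependent variable one might write 'class_weight={f: 1, t: 3},
max_iter=200'.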
@anchor predict
@par Prediction Function
The prediction function is used to estimate the conditional mean given a new
predictor. The same syntax is used for classification, regression and novelty
detection:
<pre class="syntax">
svm_predict(
    model_table,
    new_data_table,
    id_col_name,
    output_table
)
</pre>
\b Arguments
<DL class="arglist">
<DT>model_table</DT>
<DD>TEXT. Model table produced by the training function.</DD>
<DT>new_data_table</DT>
<DD>TEXT. Name of the table containing the prediction data. This table is expected
to contain the same features that were used during training. It must also
contain the column named in \e id_col_name, which identifies each row.</DD>
<DT>id_col_name</DT>
<DD>TEXT. The name of the id column in the input table.</DD>
<DT>output_table</DT>
<DD>TEXT. Name of the table where output predictions are written. If this
table name is already in use, an error is returned. The table contains:</DD>
<table class="output">
<tr>
<th>id</th>
<td>Gives the 'id' for each prediction, corresponding to each row from the new_data_table.</td>
</tr>
<tr>
<th>prediction</th>
<td>Provides the prediction for each row in new_data_table.
For regression this would be the same as decision_function. For classification,
this will be one of the dependent variable values.</td>
</tr>
<tr>
<th>decision_function</th>
<td>Provides the distance between each point and the separating hyperplane.</td>
</tr>
</table>
</DL>
@anchor example
@par Examples
<h4>Classification</h4>
-# Create an input data set.
<pre class="example">
DROP TABLE IF EXISTS houses;
CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
size INT, lot INT);
INSERT INTO houses VALUES
(1 , 590 , 2 , 1 , 50000 , 770 , 22100),
(2 , 1050 , 3 , 2 , 85000 , 1410 , 12000),
(3 , 20 , 3 , 1 , 22500 , 1060 , 3500),
(4 , 870 , 2 , 2 , 90000 , 1300 , 17500),
(5 , 1320 , 3 , 2 , 133000 , 1500 , 30000),
(6 , 1350 , 2 , 1 , 90500 , 820 , 25700),
(7 , 2790 , 3 , 2.5 , 260000 , 2130 , 25000),
(8 , 680 , 2 , 1 , 142500 , 1170 , 22000),
(9 , 1840 , 3 , 2 , 160000 , 1500 , 19000),
(10 , 3680 , 4 , 2 , 240000 , 2790 , 20000),
(11 , 1660 , 3 , 1 , 87000 , 1030 , 17500),
(12 , 1620 , 3 , 2 , 118600 , 1250 , 20000),
(13 , 3100 , 3 , 2 , 140000 , 1760 , 38000),
(14 , 2070 , 2 , 3 , 148000 , 1550 , 14000),
(15 , 650 , 3 , 1.5 , 65000 , 1450 , 12000);
</pre>
-# Train a linear classification model and view it.
The binary dependent variable is the expression price < $100,000.
<pre class="example">
DROP TABLE IF EXISTS houses_svm, houses_svm_summary;
SELECT madlib.svm_classification('houses',
'houses_svm',
'price < 100000',
'ARRAY[1, tax, bath, size]'
);
-- Set extended display on for easier reading of output
\\x on
SELECT * FROM houses_svm;
</pre>
<pre class="result">
-[ RECORD 1 ]------+--------------------------------------------------------------------------------
coef | {0.103994021495116,-0.00288252192097756,0.0540748706580464,0.00131729978010033}
loss | 0.928463796644648
norm_of_gradient | 7849.34910604307
num_iterations | 100
num_rows_processed | 15
num_rows_skipped | 0
dep_var_mapping | {f,t}
</pre>
-# Predict using linear model. We want to predict if house price
is less than $100,000. We use the training data set for prediction
as well, which is not usual but serves to show the syntax. The
predicted results are in the \e prediction column and the actual
data is in the \e actual column.
<pre class="example">
DROP TABLE IF EXISTS houses_pred;
SELECT madlib.svm_predict('houses_svm',
'houses',
'id',
'houses_pred');
\\x off
SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred USING (id) ORDER BY id;
</pre>
<pre class="result">
id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual
----+------+---------+------+--------+------+-------+------------+--------------------+--------
1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 0.211310440574799 | t
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 0.37546191651855 | t
3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | t | 2.4021783278516 | t
4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | t | 0.63967342411632 | t
5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | f | -0.179964783767855 | f
6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | f | -1.78347623159173 | t
7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | f | -2.86795504439645 | f
8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | t | 0.811108105668757 | f
9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | f | -1.61739505790168 | f
10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | f | -3.96700444824078 | f
11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | f | -2.19489938920329 | t
12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | f | -1.53961627668269 | f
13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | f | -4.54881979553637 | f
14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | f | -2.06911803381861 | f
15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | t | 1.52704061329968 | t
(15 rows)
</pre>
Count the misclassifications:
<pre class="example">
SELECT COUNT(*) FROM houses_pred JOIN houses USING (id)
WHERE houses_pred.prediction != (houses.price < 100000);
</pre>
<pre class="result">
count
-------+
3
(1 row)
</pre>
-# Train using a Gaussian kernel. This time we specify
the initial step size and the maximum number of iterations. As part of the
kernel parameters, we choose 10 as the dimension of the feature space in which
the SVM is trained. As a result, the model is a 10-dimensional coefficient
vector, instead of 4-dimensional as in the linear case.
<pre class="example">
DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random;
SELECT madlib.svm_classification( 'houses',
'houses_svm_gaussian',
'price < 100000',
'ARRAY[1, tax, bath, size]',
'gaussian',
'n_components=10',
'',
'init_stepsize=1, max_iter=200'
);
\\x on
SELECT * FROM houses_svm_gaussian;
</pre>
<pre class="result">
-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
coef | {-1.67275666209207,1.5191640881642,-0.503066422926727,1.33250956564454,2.23009854231314,-0.0602475029497936,1.97466397155921,2.3668779833279,0.577739846910355,2.81255996089824}
loss | 0.0571869097340991
norm_of_gradient | 1.18281830047046
num_iterations | 177
num_rows_processed | 15
num_rows_skipped | 0
dep_var_mapping | {f,t}
</pre>
-# Prediction using the Gaussian model. The predicted results are
in the \e prediction column and the actual data is in the \e actual column.
<pre class="example">
DROP TABLE IF EXISTS houses_pred_gaussian;
SELECT madlib.svm_predict('houses_svm_gaussian',
'houses',
'id',
'houses_pred_gaussian');
\\x off
SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id;
</pre>
<pre class="result">
id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual
----+------+---------+------+--------+------+-------+------------+--------------------+--------
1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 1.89855833083557 | t
2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 1.47736856649617 | t
3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | t | 0.999999992995691 | t
4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | t | 0.999999989634351 | t
5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | f | -1.03645694166465 | f
6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | t | 1.16430515664766 | t
7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | f | -0.545622670134529 | f
8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | f | -1.00000000207512 | f
9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | f | -1.4748622470053 | f
10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | f | -1.00085274698056 | f
11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | t | 1.8614251155696 | t
12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | f | -1.77616417509695 | f
13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | f | -1.07759348149549 | f
14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | f | -3.42475835116536 | f
15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | t | 1.00000008401961 | t
(15 rows)
</pre>
Count the misclassifications. Note that this model is more accurate
than the linear one for this data set:
<pre class="example">
SELECT COUNT(*) FROM houses_pred_gaussian JOIN houses USING (id)
WHERE houses_pred_gaussian.prediction != (houses.price < 100000);
</pre>
<pre class="result">
count
-------+
0
(1 row)
</pre>
-# For a data set with unbalanced class sizes, use the 'balanced'
class_weight parameter when building the model:
<pre class="example">
DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random;
SELECT madlib.svm_classification( 'houses',
'houses_svm_gaussian',
'price < 150000',
'ARRAY[1, tax, bath, size]',
'gaussian',
'n_components=10',
'',
'init_stepsize=1, max_iter=200, class_weight=balanced'
);
\\x on
SELECT * FROM houses_svm_gaussian;
</pre>
<pre class="result">
-[ RECORD 1 ]------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
coef | {0.891926151039837,0.169282494673541,-2.26539133689874,0.526518499596676,-0.900664505989526,0.508112011288015,-0.355474591147659,1.23127975981665,1.53694964239487,1.46496058633682}
loss | 0.56900274445785
norm_of_gradient | 0.989597662458527
num_iterations | 183
num_rows_processed | 15
num_rows_skipped | 0
dep_var_mapping | {f,t}
</pre>
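A polynomial kernel can be trained in the same way. Below is a minimal sketch
using the same houses data; the parameter values are illustrative only, and
the resulting model is not shown:
<pre class="example">
DROP TABLE IF EXISTS houses_svm_poly, houses_svm_poly_summary, houses_svm_poly_random;
SELECT madlib.svm_classification( 'houses',
                                  'houses_svm_poly',
                                  'price < 100000',
                                  'ARRAY[1, tax, bath, size]',
                                  'polynomial',
                                  'degree=2, coef0=1, n_components=10',
                                  '',
                                  'init_stepsize=1, max_iter=200'
                     );
</pre>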
<h4>Regression</h4>
-# Create the input data set. For regression we use part of the well-known
abalone data set https://archive.ics.uci.edu/ml/datasets/abalone :
<pre class="example">
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (id INT, sex TEXT, length FLOAT, diameter FLOAT, height FLOAT, rings INT);
INSERT INTO abalone VALUES
(1,'M',0.455,0.365,0.095,15),
(2,'M',0.35,0.265,0.09,7),
(3,'F',0.53,0.42,0.135,9),
(4,'M',0.44,0.365,0.125,10),
(5,'I',0.33,0.255,0.08,7),
(6,'I',0.425,0.3,0.095,8),
(7,'F',0.53,0.415,0.15,20),
(8,'F',0.545,0.425,0.125,16),
(9,'M',0.475,0.37,0.125,9),
(10,'F',0.55,0.44,0.15,19),
(11,'F',0.525,0.38,0.14,14),
(12,'M',0.43,0.35,0.11,10),
(13,'M',0.49,0.38,0.135,11),
(14,'F',0.535,0.405,0.145,10),
(15,'F',0.47,0.355,0.1,10),
(16,'M',0.5,0.4,0.13,12),
(17,'I',0.355,0.28,0.085,7),
(18,'F',0.44,0.34,0.1,10),
(19,'M',0.365,0.295,0.08,7),
(20,'M',0.45,0.32,0.1,9);
</pre>
-# Train a linear regression model:
<pre class="example">
DROP TABLE IF EXISTS abalone_svm_regression, abalone_svm_regression_summary;
SELECT madlib.svm_regression('abalone',
'abalone_svm_regression',
'rings',
'ARRAY[1, length, diameter, height]'
);
\\x on
SELECT * FROM abalone_svm_regression;
</pre>
<pre class="result">
-[ RECORD 1 ]------+-----------------------------------------------------------------------
coef | {1.998949892503,0.918517478913099,0.712125856084095,0.229379472956877}
loss | 8.29033295818392
norm_of_gradient | 23.225177785827
num_iterations | 100
num_rows_processed | 20
num_rows_skipped | 0
dep_var_mapping | {NULL}
</pre>
-# Predict using the linear regression model:
<pre class="example">
DROP TABLE IF EXISTS abalone_regr;
SELECT madlib.svm_predict('abalone_svm_regression',
'abalone',
'id',
'abalone_regr');
\\x off
SELECT * FROM abalone JOIN abalone_regr USING (id) ORDER BY id;
</pre>
<pre class="result">
id | sex | length | diameter | height | rings | prediction | decision_function
----+-----+--------+----------+--------+-------+------------------+-------------------
1 | M | 0.455 | 0.365 | 0.095 | 15 | 2.69859240928376 | 2.69859240928376
2 | M | 0.35 | 0.265 | 0.09 | 7 | 2.52978857282818 | 2.52978857282818
3 | F | 0.53 | 0.42 | 0.135 | 9 | 2.81582333426116 | 2.81582333426116
4 | M | 0.44 | 0.365 | 0.125 | 10 | 2.69169603073001 | 2.69169603073001
5 | I | 0.33 | 0.255 | 0.08 | 7 | 2.50200316683054 | 2.50200316683054
6 | I | 0.425 | 0.3 | 0.095 | 8 | 2.62474869654157 | 2.62474869654157
7 | F | 0.53 | 0.415 | 0.15 | 20 | 2.81570339722408 | 2.81570339722408
8 | F | 0.545 | 0.425 | 0.125 | 16 | 2.83086793257882 | 2.83086793257882
9 | M | 0.475 | 0.37 | 0.125 | 9 | 2.72740477577673 | 2.72740477577673
10 | F | 0.55 | 0.44 | 0.15 | 19 | 2.8518768970598 | 2.8518768970598
11 | F | 0.525 | 0.38 | 0.14 | 14 | 2.78389260680315 | 2.78389260680315
12 | M | 0.43 | 0.35 | 0.11 | 10 | 2.66838827339779 | 2.66838827339779
13 | M | 0.49 | 0.38 | 0.135 | 11 | 2.75059759385832 | 2.75059759385832
14 | F | 0.535 | 0.405 | 0.145 | 10 | 2.81202782833915 | 2.81202782833915
15 | F | 0.47 | 0.355 | 0.1 | 10 | 2.70639581129576 | 2.70639581129576
16 | M | 0.5 | 0.4 | 0.13 | 12 | 2.77287839069521 | 2.77287839069521
17 | I | 0.355 | 0.28 | 0.085 | 7 | 2.54391615211472 | 2.54391615211472
18 | F | 0.44 | 0.34 | 0.1 | 10 | 2.66815839489651 | 2.66815839489651
19 | M | 0.365 | 0.295 | 0.08 | 7 | 2.56263631931732 | 2.56263631931732
20 | M | 0.45 | 0.32 | 0.1 | 9 | 2.66310105219146 | 2.66310105219146
(20 rows)
</pre>
Compute the RMS error:
<pre class="example">
SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone
JOIN abalone_regr USING (id);
</pre>
<pre class="result">
rms_error
-----------------+
9.0884271818321
(1 row)
</pre>
-# Train a non-linear regression model using a Gaussian kernel:
<pre class="example">DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary, abalone_svm_gaussian_regression_random;
SELECT madlib.svm_regression( 'abalone',
'abalone_svm_gaussian_regression',
'rings',
'ARRAY[1, length, diameter, height]',
'gaussian',
'n_components=10',
'',
'init_stepsize=1, max_iter=200'
);
\\x on
SELECT * FROM abalone_svm_gaussian_regression;
</pre>
<pre class="result">
-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
coef | {4.49016341280977,2.19062972461334,-2.04673653356154,1.11216153651262,2.83478599238881,-4.23122821845785,4.17684533744501,-5.36892552740644,0.775782561685621,-3.62606941016707}
loss | 2.66850539541894
norm_of_gradient | 0.97440079536379
num_iterations | 163
num_rows_processed | 20
num_rows_skipped | 0
dep_var_mapping | {NULL}
</pre>
-# Predict using the Gaussian regression model:
<pre class="example">
DROP TABLE IF EXISTS abalone_gaussian_regr;
SELECT madlib.svm_predict('abalone_svm_gaussian_regression',
'abalone',
'id',
'abalone_gaussian_regr');
\\x off
SELECT * FROM abalone JOIN abalone_gaussian_regr USING (id) ORDER BY id;
</pre>
<pre class="result">
id | sex | length | diameter | height | rings | prediction | decision_function
----+-----+--------+----------+--------+-------+------------------+-------------------
1 | M | 0.455 | 0.365 | 0.095 | 15 | 9.92189555675422 | 9.92189555675422
2 | M | 0.35 | 0.265 | 0.09 | 7 | 9.81553107620013 | 9.81553107620013
3 | F | 0.53 | 0.42 | 0.135 | 9 | 10.0847384862759 | 10.0847384862759
4 | M | 0.44 | 0.365 | 0.125 | 10 | 10.0100000075406 | 10.0100000075406
5 | I | 0.33 | 0.255 | 0.08 | 7 | 9.74093262454458 | 9.74093262454458
6 | I | 0.425 | 0.3 | 0.095 | 8 | 9.94807651709641 | 9.94807651709641
7 | F | 0.53 | 0.415 | 0.15 | 20 | 10.1448936105369 | 10.1448936105369
8 | F | 0.545 | 0.425 | 0.125 | 16 | 10.0579420659954 | 10.0579420659954
9 | M | 0.475 | 0.37 | 0.125 | 9 | 10.055724626407 | 10.055724626407
10 | F | 0.55 | 0.44 | 0.15 | 19 | 10.1225030222559 | 10.1225030222559
11 | F | 0.525 | 0.38 | 0.14 | 14 | 10.160706707435 | 10.160706707435
12 | M | 0.43 | 0.35 | 0.11 | 10 | 9.95760174386841 | 9.95760174386841
13 | M | 0.49 | 0.38 | 0.135 | 11 | 10.0981242315617 | 10.0981242315617
14 | F | 0.535 | 0.405 | 0.145 | 10 | 10.1501121415596 | 10.1501121415596
15 | F | 0.47 | 0.355 | 0.1 | 10 | 9.97689437628973 | 9.97689437628973
16 | M | 0.5 | 0.4 | 0.13 | 12 | 10.0633271219326 | 10.0633271219326
17 | I | 0.355 | 0.28 | 0.085 | 7 | 9.79492924255328 | 9.79492924255328
18 | F | 0.44 | 0.34 | 0.1 | 10 | 9.94856833428783 | 9.94856833428783
19 | M | 0.365 | 0.295 | 0.08 | 7 | 9.78278863173308 | 9.78278863173308
20 | M | 0.45 | 0.32 | 0.1 | 9 | 9.98822477687532 | 9.98822477687532
(20 rows)
</pre>
Compute the RMS error. Note that this model is more accurate than
the linear one for this data set:
<pre class="example">
SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone
JOIN abalone_gaussian_regr USING (id);
</pre>
<pre class="result">
rms_error
------------------+
3.83678516581768
(1 row)
</pre>
-# Cross validation. Let's run cross validation for different initial step sizes and lambda values:
<pre class="example">
DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary,
abalone_svm_gaussian_regression_random, abalone_svm_gaussian_regression_cv;
SELECT madlib.svm_regression( 'abalone',
'abalone_svm_gaussian_regression',
'rings',
'ARRAY[1, length, diameter, height]',
'gaussian',
'n_components=10',
'',
'init_stepsize=[0.01,1], n_folds=3, max_iter=100, lambda=[0.01, 0.1, 0.5],
validation_result=abalone_svm_gaussian_regression_cv'
);
\\x on
SELECT * FROM abalone_svm_gaussian_regression;
</pre>
<pre class="result">
-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
coef | {4.78843909576373,1.82443514279869,-2.36814051329038,1.48570658714655,1.77828407859486,-6.38446590203159,3.99631488328669,-6.52441625480202,3.82342434407222,-3.67579802967837}
loss | 2.5002301320973
norm_of_gradient | 0.692031063542266
num_iterations | 100
num_rows_processed | 20
num_rows_skipped | 0
dep_var_mapping | {NULL}
</pre>
View the summary table; the final model parameters are those that produced
the lowest error in the cross validation runs:
<pre class="example">
SELECT * FROM abalone_svm_gaussian_regression_summary;
</pre>
<pre class="result">
-[ RECORD 1 ]--------+------------------------------------------------------------------------------------
method | SVR
version_number | 1.22.0-dev
source_table | abalone
model_table | abalone_svm_gaussian_regression
dependent_varname | rings
independent_varname | ARRAY[1, length, diameter, height]
kernel_func | gaussian
kernel_params | gamma=0.25, n_components=10,random_state=1, fit_intercept=False, fit_in_memory=True
grouping_col | NULL
optim_params | init_stepsize=1.0,
| decay_factor=0.9,
| max_iter=100,
| tolerance=1e-10,
| epsilon=0.01,
| eps_table=,
| class_weight=
reg_params | lambda=0.01, norm=l2, n_folds=3
num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 20
total_rows_skipped | 0
(6 rows)
</pre>
View the statistics for the various cross validation values:
<pre class="example">
\\x off
SELECT * FROM abalone_svm_gaussian_regression_cv;
</pre>
<pre class="result">
init_stepsize | lambda | mean_score | std_dev_score
---------------+--------+----------------+---------------
0.01 | 0.5 | -10.3718064203 | 1.70034854924
1.0 | 0.5 | -4.03598164681 | 2.03645607616
0.01 | 0.01 | -10.3424727426 | 1.70103511744
0.01 | 0.1 | -10.3479312204 | 1.70090715482
1.0 | 0.01 | -3.59999345219 | 1.80351978881
1.0 | 0.1 | -3.74710586166 | 1.91455355654
(6 rows)
</pre>
-# Predict using the cross-validated Gaussian regression model:
<pre class="example">
DROP TABLE IF EXISTS abalone_gaussian_regr;
SELECT madlib.svm_predict('abalone_svm_gaussian_regression',
'abalone',
'id',
'abalone_gaussian_regr');
</pre>
Compute the RMS error. Note that this produces a more accurate result than
the previous run with the Gaussian kernel:
<pre class="example">
SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone
JOIN abalone_gaussian_regr USING (id);
</pre>
<pre class="result">
rms_error
------------------+
3.64653492872019
(1 row)
</pre>
<h4>Novelty Detection</h4>
-# Now train a non-linear one-class SVM for novelty detection, using a Gaussian kernel.
Note that the dependent variable is not a parameter for one-class:
<pre class="example">
DROP TABLE IF EXISTS houses_one_class_gaussian, houses_one_class_gaussian_summary, houses_one_class_gaussian_random;
SELECT madlib.svm_one_class('houses',
'houses_one_class_gaussian',
'ARRAY[1,tax,bedroom,bath,size,lot,price]',
'gaussian',
'gamma=0.5,n_components=55, random_state=3',
NULL,
'max_iter=100, init_stepsize=10,lambda=10, tolerance=0'
);
\\x on
SELECT * FROM houses_one_class_gaussian;
</pre>
Result:
<pre class="result">
-[ RECORD 1 ]------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
coef | {redacted for brevity}
loss | 0.944016313708205
norm_of_gradient | 14.5271059047443
num_iterations | 100
num_rows_processed | 16
num_rows_skipped | -1
dep_var_mapping | {-1,1}
</pre>
-# For novelty detection using the one-class model, let's create a test data
set consisting of the last 3 rows of the training set plus an outlier
(10x price) at the end:
<pre class="example">
DROP TABLE IF EXISTS houses_one_class_test;
CREATE TABLE houses_one_class_test (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
size INT, lot INT);
INSERT INTO houses_one_class_test VALUES
(1 , 3100 , 3 , 2 , 140000 , 1760 , 38000),
(2 , 2070 , 2 , 3 , 148000 , 1550 , 14000),
(3 , 650 , 3 , 1.5 , 65000 , 1450 , 12000),
(4 , 650 , 3 , 1.5 , 650000 , 1450 , 12000);
</pre>
Now run prediction using the Gaussian one-class novelty detection model:
<pre class="example">
DROP TABLE IF EXISTS houses_pred;
SELECT madlib.svm_predict('houses_one_class_gaussian',
'houses_one_class_test',
'id',
'houses_pred');
\\x off
SELECT * FROM houses_one_class_test JOIN houses_pred USING (id) ORDER BY id;
</pre>
Result showing the last row predicted to be novel:
<pre class="result">
id | tax | bedroom | bath | price | size | lot | prediction | decision_function
----+------+---------+------+--------+------+-------+------------+---------------------
1 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 1 | 0.111497008121437
2 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 1 | 0.0996021345169148
3 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | 1 | 0.0435064008756942
4 | 650 | 3 | 1.5 | 650000 | 1450 | 12000 | -1 | -0.0168967845338403
(4 rows)
</pre>
@anchor background
@par Technical Background
To solve the linear SVM problem, the following objective function is minimized:
\f[
\underset{w,b}{\text{Minimize }} \lambda||w||^2 + \frac{1}{n}\sum_{i=1}^n \ell(y_i,f_{w,b}(x_i))
\f]
where \f$(x_1,y_1),\ldots,(x_n,y_n)\f$ are labeled training data and
\f$\ell(y,f(x))\f$ is a loss function. When performing classification,
\f$\ell(y,f(x)) = \max(0,1-yf(x))\f$ is the <em>hinge loss</em>.
For regression, the loss function \f$\ell(y,f(x)) = \max(0,|y-f(x)|-\epsilon)\f$
is used.
If \f$ f_{w,b}(x) = \langle w, x\rangle + b\f$ is linear, then the
objective function is convex and incremental gradient descent (IGD, or SGD)
can be applied to find a global minimum. See Feng, et al. [1] for more details.
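In outline (details of the actual implementation may differ), one IGD step on a
training row \f$(x_i, y_i)\f$ for the hinge loss with L2 regularization and
stepsize \f$\eta\f$ takes the form
\f[
w \leftarrow (1 - 2\eta\lambda)\,w +
\begin{cases}
\eta\, y_i x_i & \text{if } y_i f_{w,b}(x_i) < 1, \\
0 & \text{otherwise.}
\end{cases}
\f]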
To learn with Gaussian or polynomial kernels, the training data is first mapped
via a <em>random feature map</em> in such a way that the usual inner product in
the feature space approximates the kernel function in the input space. The
linear SVM training function is then run on the resulting data. See the papers
[2,3] for more information on random feature maps.
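For example, for the Gaussian kernel, the construction of [3] draws \f$D\f$
random directions \f$w_i \sim N(0, 2\gamma I)\f$ and offsets
\f$b_i \sim \text{Uniform}[0, 2\pi]\f$, where \f$D\f$ is the \e n_components
kernel parameter, and maps each input \f$x\f$ to
\f[
z(x) = \sqrt{2/D}\,\big[\cos(w_1^T x + b_1), \ldots, \cos(w_D^T x + b_D)\big]^T,
\f]
so that \f$z(x)^T z(y) \approx \exp(-\gamma||x-y||^2)\f$ in expectation.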
Also, see the book [4] by Scholkopf and Smola for more details on SVMs in general.
@anchor literature
@literature
@anchor svm-lit-1
[1] Xixuan Feng, Arun Kumar, Ben Recht, and Christopher Re:
Towards a Unified Architecture for in-RDBMS analytics,
in SIGMOD Conference, 2012
http://www.eecs.berkeley.edu/~brecht/papers/12.FengEtAl.SIGMOD.pdf
@anchor svm-lit-2
[2] Purushottam Kar and Harish Karnick: Random Feature Maps for Dot
Product Kernels, Proceedings of the 15th International Conference
on Artificial Intelligence and Statistics, 2012,
http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2012_KarK12.pdf
@anchor svm-lit-3
[3] Ali Rahimi and Ben Recht: Random Features for Large-Scale
Kernel Machines, Neural Information Processing Systems 2007,
http://www.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf
@anchor svm-lit-4
[4] Bernhard Scholkopf and Alexander Smola: Learning with Kernels,
The MIT Press, Cambridge, MA, 2002.
@anchor svm-lit-5
[5] Vladimir Cherkassky and Yunqian Ma: Practical Selection of SVM Parameters
and Noise Estimation for SVM Regression, Neural Networks, 2004
http://www.ece.umn.edu/users/cherkass/N2002-SI-SVM-13-whole.pdf
@anchor related
@par Related Topics
File svm.sql_in documenting the training function
@internal
@sa Namespace SVM (documenting the driver/outer loop implemented in
Python), Namespace
\ref madlib::modules::regress documenting the implementation in C++
@endinternal
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.linear_svm_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.linear_svm_result AS (
coefficients double precision[],
loss double precision,
norm_of_gradient double precision,
num_rows_processed bigint
);
--------------------------------------------------------------------------
-- create SQL functions for IGD optimizer
--------------------------------------------------------------------------
-- cannot be labeled as STRICT because we set previous_state NULL initially
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_transition(
state double precision[],
ind_var double precision[],
dep_var double precision,
previous_state double precision[],
dimension integer,
stepsize double precision,
reg double precision,
is_l2 boolean,
n_tuples integer,
epsilon double precision,
is_svc boolean,
tuple_weight double precision
)
RETURNS double precision[] AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_merge(
state1 double precision[],
state2 double precision[])
RETURNS double precision[] AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_final(
state double precision[])
RETURNS double precision[] AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
/**
* @internal
* @brief Perform one iteration of the incremental gradient
* method for computing linear support vector machine
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linear_svm_igd_step(
/*+ ind_var */ double precision[],
/*+ dep_var */ double precision,
/*+ previous_state */ double precision[],
/*+ dimension */ integer,
/*+ stepsize */ double precision,
/*+ reg */ double precision,
/*+ is_l2 */ boolean,
/*+ n_tuples */ integer,
/*+ epsilon */ double precision,
/*+ is_svc */ boolean,
/*+ tuple_weight */ double precision
);
CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_step(
/*+ ind_var */ double precision[],
/*+ dep_var */ double precision,
/*+ previous_state */ double precision[],
/*+ dimension */ integer,
/*+ stepsize */ double precision,
/*+ reg */ double precision,
/*+ is_l2 */ boolean,
/*+ n_tuples */ integer,
/*+ epsilon */ double precision,
/*+ is_svc */ boolean,
/*+ tuple_weight */ double precision
) (
STYPE=double precision[],
SFUNC=MADLIB_SCHEMA.linear_svm_igd_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linear_svm_igd_merge,')
FINALFUNC=MADLIB_SCHEMA.linear_svm_igd_final,
INITCOND='{0,0,0,0,0,0,0}'
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_distance(
/*+ state1 */ double precision[],
/*+ state2 */ double precision[])
RETURNS double precision AS 'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_result(
/*+ state */ double precision[])
RETURNS MADLIB_SCHEMA.linear_svm_result AS 'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text,
params text,
verbose bool)
RETURNS void AS $$
# indent according to PythonFunction
global is_svc
is_svc = False
PythonFunction(svm, svm, svm)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text,
params text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
source_table text,
model_table text,
dependent_varname text,
independent_varname text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
-----------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text,
params text,
verbose bool)
RETURNS void AS $$
# indent according to PythonFunction
global is_svc
is_svc = True
PythonFunction(svm, svm, svm)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
-- all default value handling implemented in Python
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text,
params text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, $8, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text,
kernel_params text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
source_table text,
model_table text,
dependent_varname text,
independent_varname text,
kernel_func text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
source_table text,
model_table text,
dependent_varname text,
independent_varname text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
-----------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
source_table text,
model_table text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text,
params text,
verbose bool)
RETURNS void AS $$
PythonFunction(svm, svm, svm_one_class)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
-- all default value handling implemented in Python
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
source_table text,
model_table text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text,
params text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, $7, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
source_table text,
model_table text,
independent_varname text,
kernel_func text,
kernel_params text,
grouping_col text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
source_table text,
model_table text,
independent_varname text,
kernel_func text,
kernel_params text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, ''::text, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
source_table text,
model_table text,
independent_varname text,
kernel_func text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, ''::text, ''::text, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
source_table text,
model_table text,
independent_varname text)
RETURNS void AS $$
SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, ''::text,
''::text, ''::text, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
------ Prediction -------------------------------------------------------------
/**
* @brief Scores the data points stored in a table using a learned linear support-vector model
* @param model_table Name of table where the learned model to be used is stored
* @param new_data_table Name of table/view containing the data points to be scored
* @param id_col_name Name of column in new_data_table containing the integer identifier of data points
* @param output_table Name of table to store the results
*
* @return Textual summary of the algorithm run
*
* @internal
* @sa This function is a wrapper for svm.svm_predict().
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(
model_table text,
new_data_table text,
id_col_name text,
output_table text)
RETURNS void AS $$
PythonFunction(svm, svm, svm_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
-- Online Help -----------------------------------------------------------
/**
* @brief Help function
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(
message TEXT
) RETURNS TEXT AS $$
PythonFunction(svm, svm, svm_predict_help)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict()
RETURNS TEXT AS $$
SELECT MADLIB_SCHEMA.svm_predict(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification (
message TEXT
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(svm, svm)
return svm.svm_help(schema_madlib, message, True)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification ()
RETURNS TEXT AS $$
SELECT MADLIB_SCHEMA.svm_classification(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class (
message TEXT
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(svm, svm)
return svm.svm_one_class_help(schema_madlib, message, True)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class ()
RETURNS TEXT AS $$
SELECT MADLIB_SCHEMA.svm_one_class(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression (
message TEXT
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(svm, svm)
return svm.svm_help(schema_madlib, message, False)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression ()
RETURNS TEXT AS $$
SELECT MADLIB_SCHEMA.svm_regression(''::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');