| /* ----------------------------------------------------------------------- */ |
| /** |
| * |
| * @file svm.sql_in |
| * |
| * @brief SQL functions for SVM (Poisson) |
| * @date July 2015 |
| * |
| * @sa For a brief introduction to SVM (Poisson), see the |
| * module description \ref grp_svm. |
| * |
| */ |
| /* ----------------------------------------------------------------------- */ |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_svm |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#svm_classification">Classification Function</a></li> |
| <li class="level1"><a href="#svm_regression">Regression Function</a></li> |
| <li class="level1"><a href="#novelty_detection">Novelty Detection</a></li> |
| <li class="level1"><a href="#kernel_params">Kernel Parameters</a></li> |
| <li class="level1"><a href="#parameters">Other Parameters</a></li> |
| <li class="level1"><a href="#predict">Prediction Functions</a></li> |
| <li class="level1"><a href="#example">Examples</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| Support vector machines are models for regression and classification |
| tasks. SVM models have two particularly desirable features: robustness in the |
| presence of noisy data and applicability to a variety of data configurations. At |
| its core, a <em>linear</em> SVM model is a hyperplane separating two |
| distinct classes of data (in the case of classification problems), in such a way |
| that the distance between the hyperplane and the nearest training data point |
| (called the <em>margin</em>) is maximized. Vectors that lie on this margin are |
| called support vectors. With the support vectors fixed, perturbations of vectors |
| beyond the margin will not affect the model; this contributes to the model’s |
| robustness. By substituting a kernel function for the usual inner product, one can |
| approximate a large variety of decision boundaries in addition to linear hyperplanes. |
| @brief Solves classification and regression problems by separating data with |
| a hyperplane or other nonlinear decision boundary. |
| |
| @anchor svm_classification |
| @par Classification Training Function |
| The SVM binary classification training function has the following format: |
| <pre class="syntax"> |
| svm_classification( |
| source_table, |
| model_table, |
| dependent_varname, |
| independent_varname, |
| kernel_func, |
| kernel_params, |
| grouping_col, |
| params, |
| verbose |
| ) |
| </pre> |
| \b Arguments |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>TEXT. Name of the table containing the training data.</DD> |
| |
| <DT>model_table</DT> |
| <DD>TEXT. Name of the output table containing the model. Details of the output |
| tables are provided below. |
| </DD> |
| |
| <DT>dependent_varname</DT> |
| <DD> TEXT. Name of the dependent variable column. For classification, this |
| column can contain values of any type, but must assume exactly two distinct |
| values since only binary classification is currently supported. |
| </DD> |
| |
| <DT>independent_varname</DT> |
| <DD>TEXT. Expression list to evaluate for the |
| independent variables. An intercept variable should not be included as part |
| of this expression. See 'fit_intercept' in the kernel params for info on |
| intercepts. Please note that expression should be able to be cast |
| to DOUBLE PRECISION[]. |
| |
| <DT>kernel_func (optional)</DT> |
| <DD>TEXT, default: 'linear'. |
| Type of kernel. Currently three kernel types are supported: 'linear', |
| 'gaussian', and 'polynomial'. The text can be any subset of the three |
| strings; for e.g., kernel_func='ga' will create a Gaussian kernel. |
| </DD> |
| |
| <DT>kernel_params (optional)</DT> |
| <DD>TEXT, defaults: NULL. |
| Parameters for non-linear kernel in a comma-separated string of key-value pairs. |
| The actual parameters differ depending on the value of \e kernel_func. |
| See the description below for details. |
| </DD> |
| |
| <DT>grouping_col (optional)</DT> |
| <DD>TEXT, default: NULL. An expression list used to group |
| the input dataset into discrete groups, which results in running one model per group. |
| Similar to the SQL "GROUP BY" clause. When this value is NULL, no |
| grouping is used and a single model is generated. Please note that |
| cross validation is not supported if grouping is used.</DD> |
| |
| <DT>params (optional)</DT> |
| <DD>TEXT, default: NULL. |
| Parameters for optimization and regularization in a comma-separated string |
| of key-value pairs. If a list of values is provided, then cross-validation |
| will be performed to select the \e best value from the list. See the |
| description below for details. |
| </DD> |
| |
| <DT>verbose (optional)</DT> |
| <DD>BOOLEAN default: FALSE. |
| Verbose output of the results of training.</DD> |
| </DL> |
| |
| <b>Output tables</b> |
| <br> |
| The model table produced by SVM contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>coef</th> |
| <td>FLOAT8. Vector of coefficients.</td> |
| </tr> |
| <tr> |
| <th>grouping_key</th> |
| <td>TEXT Identifies the group to which the datum belongs.</td> |
| </tr> |
| <tr> |
| <th>num_rows_processed</th> |
| <td>BIGINT. Numbers of rows processed.</td> |
| </tr> |
| <tr> |
| <th>num_rows_skipped</th> |
| <td>BIGINT. Numbers of rows skipped due to missing values or failures.</td> |
| </tr> |
| <tr> |
| <th>num_iterations</th> |
| <td>INTEGER. Number of iterations completed by stochastic gradient descent |
| algorithm. The algorithm either converged in this number of iterations |
| or hit the maximum number specified in the optimization parameters. </td> |
| </tr> |
| <tr> |
| <th>loss</th> |
| <td>FLOAT8. Value of the objective function of SVM, expressed as an |
| average loss per row over the \e source_table. See Technical Background |
| section below for more details.</td> |
| </tr> |
| <tr> |
| <th>norm_of_gradient</th> |
| <td>FLOAT8. Value of the L2-norm of the (sub)-gradient of the objective function.</td> |
| </tr> |
| <tr> |
| <th>__dep_var_mapping</th> |
| <td>TEXT[]. Vector of dependent variable labels. The first entry |
| corresponds to -1 and the second to +1. For internal use only.</td> |
| </tr> |
| </table> |
| |
| An auxiliary table named \<model_table\>_random is created if the kernel is |
| not linear. It contains data needed to embed test data into a random feature |
| space (see references [2,3]). This data is used internally by svm_predict |
| and not meaningful on its own to the user, so you can ignore it. |
| |
| A summary table named \<model_table\>_summary is also created, which has the following columns: |
| <table class="output"> |
| <tr> |
| <th>method</th> |
| <td>'svm'</td> |
| </tr> |
| <tr> |
| <th>version_number</th> |
| <td>Version of MADlib which was used to generate the model.</td> |
| </tr> |
| <tr> |
| <th>source_table</th> |
| <td>The data source table name.</td> |
| </tr> |
| <tr> |
| <th>model_table</th> |
| <td>The model table name.</td> |
| </tr> |
| <tr> |
| <th>dependent_varname</th> |
| <td>The dependent variable.</td> |
| </tr> |
| <tr> |
| <th>independent_varname</th> |
| <td>The independent variables.</td> |
| </tr> |
| <tr> |
| <th>kernel_func</th> |
| <td>The kernel function.</td> |
| </tr> |
| <tr> |
| <th>kernel_parameters</th> |
| <td>The kernel parameters, as well as random feature map data.</td> |
| </tr> |
| <tr> |
| <th>grouping_col</th> |
| <td>Columns on which to group.</td> |
| </tr> |
| <tr> |
| <th>optim_params</th> |
| <td>A string containing the optimization parameters.</td> |
| </tr> |
| <tr> |
| <th>reg_params</th> |
| <td>A string containing the regularization parameters.</td> |
| </tr> |
| <tr> |
| <th>num_all_groups</th> |
| <td>Number of groups in SVM training.</td> |
| </tr> |
| <tr> |
| <th>num_failed_groups</th> |
| <td>Number of failed groups in SVM training.</td> |
| </tr> |
| <tr> |
| <th>total_rows_processed</th> |
| <td>Total numbers of rows processed in all groups.</td> |
| </tr> |
| <tr> |
| <th>total_rows_skipped</th> |
| <td>Total numbers of rows skipped in all groups due to missing |
| values or failures.</td> |
| </tr> |
| </table> |
| |
| If cross validation is used, a table is created with a |
| user specified name having the following columns: |
| <table class="output"> |
| <tr> |
| <th>...</th> |
| <td>Names of cross validation parameters</td> |
| </tr> |
| <tr> |
| <th>mean_score</th> |
| <td>Mean value of accuracy when predicted on the |
| validation fold, averaged over all folds and all rows.</td> |
| </tr> |
| <tr> |
| <th>std_dev_score</th> |
| <td>Standard deviation of accuracy when predicted on the |
| validation fold, averaged over all folds and all rows.</td> |
| </tr> |
| </table> |
| |
| @anchor svm_regression |
| @par Regression Training Function |
| The SVM regression training function has the following format: |
| <pre class="syntax"> |
| svm_regression(source_table, |
| model_table, |
| dependent_varname, |
| independent_varname, |
| kernel_func, |
| kernel_params, |
| grouping_col, |
| params, |
| verbose |
| ) |
| </pre> |
| |
| \b Arguments |
| |
| Specifications for regression are largely the same as for classification. In the |
| model table, there is no dependent variable mapping. The following |
| arguments have specifications which differ from svm_classification: |
| <DL class="arglist"> |
| <DT>dependent_varname</DT> |
| <DD>TEXT. Name of the dependent variable column. For regression, this column |
| can contain only values or expressions that can be cast to DOUBLE PRECISION. |
| Otherwise, an error will be thrown. |
| </DD> |
| <DT>params (optional)</DT> |
| <DD>TEXT, default: NULL. |
| The parameters \e epsilon and \e eps_table are only meaningful for regression. |
| See description below for more details. |
| </DD> |
| </DL> |
| |
| @anchor novelty_detection |
| @par Novelty Detection Training Function |
| The novelty detection function is a one-class SVM classifier, and has the following format: |
| <pre class="syntax"> |
| svm_one_class( |
| source_table, |
| model_table, |
| independent_varname, |
| kernel_func, |
| kernel_params, |
| grouping_col, |
| params, |
| verbose |
| ) |
| </pre> |
| \b Arguments |
| |
| Specifications for novelty detection are largely the same as for classification, |
| except the dependent variable name is not specified. The model table is the same |
| as that for classification. |
| |
| @anchor kernel_params |
| @par Kernel Parameters |
| Kernel parameters are supplied in a string containing a comma-delimited |
| list of name-value pairs. All of these named parameters are optional, and |
| their order does not matter. You must use the format "<param_name> = <value>" |
| to specify the value of a parameter, otherwise the parameter is ignored. |
| |
| <DL class="arglist"> |
| <DT><i>Parameters common to all kernels</i></dt><dd></dd> |
| <DT>fit_intercept</dt> |
| <DD>Default: True. The parameter \e fit_intercept is an indicator to add an |
| intercept to the \e independent_varname array expression. The intercept is added |
| to the end of the feature list - thus the last element of the coefficient list |
| is the intercept. |
| </DD> |
| <DT>n_components</DT> |
| <DD>Default: max(100, 2*num_features). The dimensionality of the transformed feature space. |
| A larger value lowers the variance of the estimate of the kernel but requires |
| more memory and takes longer to train.</DD> |
| @note |
| Setting the \e n_components kernel parameter properly is important to |
| generate an accurate decision boundary and can make the difference between a |
| good model and a useless model. Try increasing the value of \e n_components |
| if you are not getting an accurate decision boundary. This parameter arises |
| from using the primal formulation, in which we map data into a relatively |
| low-dimensional randomized feature space [2, 3]. The parameter |
| \e n_components is the dimension of that feature space. We use the primal in |
| MADlib to support scaling to large data sets, compared to R or other single |
| node implementations that use the dual formulation and hence do not have this |
| type of mapping, since the the dimensionality of the transformed feature |
| space in the dual is effectively infinite. |
| |
| <DT>random_state</DT> |
| <DD>Default: 1. Seed used by a random number generator. </DD> |
| </DL> |
| |
| <DL class="arglist"> |
| <DT><i>Parameters for 'gaussian' kernel</i></dt><dd></dd> |
| <DT>gamma</dt> |
| <DD> Default: 1/num_features. The parameter \f$\gamma\f$ in the Radius Basis Function |
| kernel, i.e., \f$\exp(-\gamma||x-y||^2)\f$. Choosing a proper value for \e gamma |
| is critical to the performance of kernel machine; e.g., while a large \e gamma |
| tends to cause overfitting, a small \e gamma will make the model too constrained |
| to capture the complexity of the data. |
| </DD> |
| </DL> |
| |
| <DL class="arglist"> |
| <DT><i>Parameters for 'polynomial' kernel</i></dt><dd></dd> |
| <DT>coef0</dt> |
| <DD>Default: 1.0. The independent term \f$q\f$ in \f$ (\langle x,y\rangle + q)^r \f$. |
| Must be larger than or equal to 0. When it is 0, the polynomial kernel is in homogeneous form. |
| </DD> |
| <DT>degree</dt> |
| <DD>Default: 3. The parameter \f$r\f$ in \f$ (\langle x,y\rangle + q)^r \f$. |
| </DD> |
| </DL> |
| |
| |
| @anchor parameters |
| @par Other Parameters |
| Parameters in this section are supplied in the \e params argument as a string |
| containing a comma-delimited list of name-value pairs. All of these named |
| parameters are optional, and their order does not matter. You must use the |
| format "<param_name> = <value>" to specify the value of a parameter, otherwise |
| the parameter is ignored. |
| |
| Hyperparameter optimization can be carried out using the built-in cross |
| validation mechanism, which is activated by assigning a value greater than 1 to |
| the parameter \e n_folds in \e params. |
| Please note that cross validation is not |
| supported if grouping is used. |
| |
| The values of a parameter to cross validate should be provided in a list. For |
| example, if one wanted to regularize with the L1 norm and use a lambda value |
| from the set {0.3, 0.4, 0.5}, one might input 'lambda={0.3, 0.4, 0.5}, norm=L1, |
| n_folds=10' in \e params. Note that the use of '{}' and '[]' are both valid |
| here. |
| @note |
| Note that not all of the parameters below can be cross-validated. For |
| parameters where cross validation is allowed, their default values are presented |
| in list format; e.g., [0.01]. |
| |
| <pre class="syntax"> |
| 'init_stepsize = <value>, |
| decay_factor = <value>, |
| max_iter = <value>, |
| tolerance = <value>, |
| lambda = <value>, |
| norm = <value>, |
| epsilon = <value>, |
| eps_table = <value>, |
| validation_result = <value>, |
| n_folds = <value>, |
| class_weight = <value>' |
| </pre> |
| \b Parameters |
| <DL class="arglist"> |
| |
| <DT>init_stepsize</dt> |
| <DD>Default: [0.01]. |
| Also known as the initial learning rate. A small value is usually desirable to |
| ensure convergence, while a large value provides more room for progress during |
| training. Since the best value depends on the condition number of the data, in |
| practice one often searches in an exponential grid using built-in cross |
| validation; e.g., "init_stepsize = [1, 0.1, 0.001]". To reduce training time, it |
| is common to run cross validation on a subsampled dataset, since this usually |
| provides a good estimate of the condition number of the whole dataset. Then the |
| resulting \e init_stepsize can be run on the whole dataset. |
| |
| </DD> |
| |
| <DT>decay_factor</DT> |
| <DD>Default: [0.9]. Control the learning rate schedule: 0 means constant rate; |
| <-1 means inverse scaling, i.e., stepsize = init_stepsize / iteration; > 0 means |
| <exponential decay, i.e., stepsize = init_stepsize * decay_factor^iteration. |
| </DD> |
| |
| <DT>max_iter</dt> |
| <DD>Default: [100]. The maximum number of iterations allowed. |
| </DD> |
| |
| <DT>tolerance</dt> |
| <DD>Default: 1e-10. The criterion to end iterations. The training stops whenever |
| the difference between the training models of two consecutive iterations is |
| smaller than \e tolerance or the iteration number is larger than \e max_iter. |
| </DD> |
| |
| <DT>lambda</dt> |
| <DD>Default: [0.01]. Regularization parameter. Must be non-negative. |
| </DD> |
| |
| <DT>norm</dt> |
| <DD>Default: 'L2'. Name of the regularization, either 'L2' or 'L1'. |
| </DD> |
| |
| <DT>epsilon</dt> |
| <DD>Default: [0.01]. |
| Determines the \f$\epsilon\f$ for \f$\epsilon\f$-regression. Ignored during classification. |
| When training the model, differences of less than \f$\epsilon\f$ between estimated labels |
| and actual labels are ignored. A larger \f$\epsilon\f$ will yield a model |
| with fewer support vectors, but will not generalize as well to future data. |
| Generally, it has been suggested that epsilon should increase with noisier |
| data, and decrease with the number of samples. See [5]. |
| </DD> |
| |
| <DT>eps_table</dt> |
| <DD>Default: NULL. |
| Name of the input table that contains values of epsilon for different groups. |
| Ignored when \e grouping_col is NULL. Define this input table if you want |
| different epsilon values for different groups. The table consists of a column |
| named \e epsilon which specifies the epsilon values, and one or more columns for |
| \e grouping_col. Extra groups are ignored, and groups not present in this table |
| will use the epsilon value specified in parameter \e epsilon. |
| </DD> |
| |
| <DT>validation_result</dt> |
| <DD>Default: NULL. |
| Name of the table to store the cross validation scores. |
| This table is only created if the name is not NULL. |
| The cross validation scores are the mean and standard deviation |
| of the accuracy when predicted on the validation fold, |
| averaged over all folds and all rows. For classification, the accuracy |
| metric used is the ratio of correct classifications. For regression, the |
| accuracy metric used is the negative of mean squared error (negative to |
| make it a concave problem, thus selecting \e max means the highest accuracy). |
| </DD> |
| |
| <DT>n_folds</dt> |
| <DD>Default: 0. |
| Number of folds (k). Must be at least 2 to activate cross validation. |
| If a value of k > 2 is specified, each fold is then used as a validation set once, |
| while the other k - 1 folds form the training set. |
| </DD> |
| |
| <DT>class_weight</dt> |
| <DD>Default: NULL for classification, 'balanced' for one-class novelty detection, |
| this param is not applicable for regression. |
| |
| Set the weight for the classes. If not given (empty/NULL), all classes are set to have |
| equal weight. If 'class_weight = balanced', values of y are automatically adjusted |
| as inversely proportional to class frequencies in the input data i.e. the weights |
| are set as n_samples / (2 * bincount(y)). |
| |
| Alternatively, 'class_weight' can be a mapping, giving the weight for each class. |
| E.g., for dependent variable values 'a' and 'b', the 'class_weight' might be |
| {a: 1, b: 3}. This gives three times the weight to observations with class value |
| 'b' compared to 'a'. (In the SVM algorithm, this translates into observations |
| with class value 'b' contributing 3x to learning in the stochastic gradient step |
| compared to 'a'.) |
| |
| For regression, the class weights are always one. |
| </DD> |
| </DL> |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function is used to estimate the conditional mean given a new |
| predictor. The same syntax is used for classification, regression and novelty |
| detection: |
| <pre class="syntax"> |
| svm_predict(model_table, |
| new_data_table, |
| id_col_name, |
| output_table) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>model_table</DT> |
| <DD>TEXT. Model table produced by the training function.</DD> |
| |
| <DT>new_data_table</DT> |
| <DD>TEXT. Name of the table containing the prediction data. This table is expected |
| to contain the same features that were used during training. The table should |
| also contain id_col_name used for identifying each row.</DD> |
| |
| <DT>id_col_name</DT> |
| <DD>TEXT. The name of the id column in the input table.</DD> |
| |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table where output predictions are written. If this |
| table name is already in use, then an error is returned. Table contains:</DD> |
| <table class="output"> |
| <tr> |
| <th>id</th> |
| <td>Gives the 'id' for each prediction, corresponding to each row from the new_data_table.</td> |
| </tr> |
| <tr> |
| <th>prediction</th> |
| <td>Provides the prediction for each row in new_data_table. |
| For regression this would be the same as decision_function. For classification, |
| this will be one of the dependent variable values.</td> |
| </tr> |
| <tr> |
| <th>decision_function</th> |
| <td>Provides the distance between each point and the separating hyperplane.</td> |
| </tr> |
| </DL> |
| </table> |
| @anchor example |
| @par Examples |
| |
| <h4>Classification</h4> |
| |
| -# Create an input data set. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses; |
| CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, |
| size INT, lot INT); |
| INSERT INTO houses VALUES |
| (1 , 590 , 2 , 1 , 50000 , 770 , 22100), |
| (2 , 1050 , 3 , 2 , 85000 , 1410 , 12000), |
| (3 , 20 , 3 , 1 , 22500 , 1060 , 3500), |
| (4 , 870 , 2 , 2 , 90000 , 1300 , 17500), |
| (5 , 1320 , 3 , 2 , 133000 , 1500 , 30000), |
| (6 , 1350 , 2 , 1 , 90500 , 820 , 25700), |
| (7 , 2790 , 3 , 2.5 , 260000 , 2130 , 25000), |
| (8 , 680 , 2 , 1 , 142500 , 1170 , 22000), |
| (9 , 1840 , 3 , 2 , 160000 , 1500 , 19000), |
| (10 , 3680 , 4 , 2 , 240000 , 2790 , 20000), |
| (11 , 1660 , 3 , 1 , 87000 , 1030 , 17500), |
| (12 , 1620 , 3 , 2 , 118600 , 1250 , 20000), |
| (13 , 3100 , 3 , 2 , 140000 , 1760 , 38000), |
| (14 , 2070 , 2 , 3 , 148000 , 1550 , 14000), |
| (15 , 650 , 3 , 1.5 , 65000 , 1450 , 12000); |
| </pre> |
| -# Train linear classification model and view the model. |
| Categorical variable is price < $100,0000. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm, houses_svm_summary; |
| SELECT madlib.svm_classification('houses', |
| 'houses_svm', |
| 'price < 100000', |
| 'ARRAY[1, tax, bath, size]' |
| ); |
| -- Set extended display on for easier reading of output |
| \\x on |
| SELECT * FROM houses_svm; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------+-------------------------------------------------------------------------------- |
| coef | {0.103994021495116,-0.00288252192097756,0.0540748706580464,0.00131729978010033} |
| loss | 0.928463796644648 |
| norm_of_gradient | 7849.34910604307 |
| num_iterations | 100 |
| num_rows_processed | 15 |
| num_rows_skipped | 0 |
| dep_var_mapping | {f,t} |
| </pre> |
| -# Predict using linear model. We want to predict if house price |
| is less than $100,000. We use the training data set for prediction |
| as well, which is not usual but serves to show the syntax. The |
| predicted results are in the \e prediction column and the actual |
| data is in the \e actual column. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_pred; |
| SELECT madlib.svm_predict('houses_svm', |
| 'houses', |
| 'id', |
| 'houses_pred'); |
| \\x off |
| SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred USING (id) ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual |
| ----+------+---------+------+--------+------+-------+------------+--------------------+-------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 0.211310440574799 | t |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 0.37546191651855 | t |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | t | 2.4021783278516 | t |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | t | 0.63967342411632 | t |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | f | -0.179964783767855 | f |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | f | -1.78347623159173 | t |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | f | -2.86795504439645 | f |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | t | 0.811108105668757 | f |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | f | -1.61739505790168 | f |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | f | -3.96700444824078 | f |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | f | -2.19489938920329 | t |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | f | -1.53961627668269 | f |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | f | -4.54881979553637 | f |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | f | -2.06911803381861 | f |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | t | 1.52704061329968 | t |
| (15 rows) |
| </pre> |
| Count the miss-classifications: |
| <pre class="example"> |
| SELECT COUNT(*) FROM houses_pred JOIN houses USING (id) |
| WHERE houses_pred.prediction != (houses.price < 100000); |
| </pre> |
| <pre class="result"> |
| count |
| -------+ |
| 3 |
| </pre> |
| -# Train using Gaussian kernel. This time we specify |
| the initial step size and maximum number of iterations to run. As part of the |
| kernel parameter, we choose 10 as the dimension of the space where we train |
| SVM. As a result, the model will be a 10 dimensional vector, instead |
| of 4 as in the case of linear model. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random; |
| SELECT madlib.svm_classification( 'houses', |
| 'houses_svm_gaussian', |
| 'price < 100000', |
| 'ARRAY[1, tax, bath, size]', |
| 'gaussian', |
| 'n_components=10', |
| '', |
| 'init_stepsize=1, max_iter=200' |
| ); |
| \\x on |
| SELECT * FROM houses_svm_gaussian; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {-1.67275666209207,1.5191640881642,-0.503066422926727,1.33250956564454,2.23009854231314,-0.0602475029497936,1.97466397155921,2.3668779833279,0.577739846910355,2.81255996089824} |
| loss | 0.0571869097340991 |
| norm_of_gradient | 1.18281830047046 |
| num_iterations | 177 |
| num_rows_processed | 15 |
| num_rows_skipped | 0 |
| dep_var_mapping | {f,t} |
| </pre> |
| -# Prediction using the Gaussian model. The predicted results are |
| in the \e prediction column and the actual data is in the \e actual column. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_pred_gaussian; |
| SELECT madlib.svm_predict('houses_svm_gaussian', |
| 'houses', |
| 'id', |
| 'houses_pred_gaussian'); |
| \\x off |
| SELECT *, price < 100000 AS actual FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | prediction | decision_function | actual |
| ----+------+---------+------+--------+------+-------+------------+--------------------+-------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 1.89855833083557 | t |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 1.47736856649617 | t |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | t | 0.999999992995691 | t |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | t | 0.999999989634351 | t |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | f | -1.03645694166465 | f |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | t | 1.16430515664766 | t |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | f | -0.545622670134529 | f |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | f | -1.00000000207512 | f |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | f | -1.4748622470053 | f |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | f | -1.00085274698056 | f |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | t | 1.8614251155696 | t |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | f | -1.77616417509695 | f |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | f | -1.07759348149549 | f |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | f | -3.42475835116536 | f |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | t | 1.00000008401961 | t |
| (15 rows) |
| </pre> |
| Count the miss-classifications. Note this produces a more accurate |
| result than the linear case for this data set: |
| <pre class="example"> |
| SELECT COUNT(*) FROM houses_pred_gaussian JOIN houses USING (id) |
| WHERE houses_pred_gaussian.prediction != (houses.price < 100000); |
| </pre> |
| <pre class="result"> |
| count |
| -------+ |
| 0 |
| (1 row) |
| </pre> |
| -# In the case of an unbalanced class-size dataset, use the 'balanced' |
| parameter to classify when building the model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random; |
| SELECT madlib.svm_classification( 'houses', |
| 'houses_svm_gaussian', |
| 'price < 150000', |
| 'ARRAY[1, tax, bath, size]', |
| 'gaussian', |
| 'n_components=10', |
| '', |
| 'init_stepsize=1, max_iter=200, class_weight=balanced' |
| ); |
| \\x on |
| SELECT * FROM houses_svm_gaussian; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {0.891926151039837,0.169282494673541,-2.26539133689874,0.526518499596676,-0.900664505989526,0.508112011288015,-0.355474591147659,1.23127975981665,1.53694964239487,1.46496058633682} |
| loss | 0.56900274445785 |
| norm_of_gradient | 0.989597662458527 |
| num_iterations | 183 |
| num_rows_processed | 15 |
| num_rows_skipped | 0 |
| dep_var_mapping | {f,t} |
| </pre> |
| |
| <h4>Regression</h4> |
| |
| -# Create input data set. For regression we use part of the well |
| known abalone data set https://archive.ics.uci.edu/ml/datasets/abalone : |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone; |
| CREATE TABLE abalone (id INT, sex TEXT, length FLOAT, diameter FLOAT, height FLOAT, rings INT); |
| INSERT INTO abalone VALUES |
| (1,'M',0.455,0.365,0.095,15), |
| (2,'M',0.35,0.265,0.09,7), |
| (3,'F',0.53,0.42,0.135,9), |
| (4,'M',0.44,0.365,0.125,10), |
| (5,'I',0.33,0.255,0.08,7), |
| (6,'I',0.425,0.3,0.095,8), |
| (7,'F',0.53,0.415,0.15,20), |
| (8,'F',0.545,0.425,0.125,16), |
| (9,'M',0.475,0.37,0.125,9), |
| (10,'F',0.55,0.44,0.15,19), |
| (11,'F',0.525,0.38,0.14,14), |
| (12,'M',0.43,0.35,0.11,10), |
| (13,'M',0.49,0.38,0.135,11), |
| (14,'F',0.535,0.405,0.145,10), |
| (15,'F',0.47,0.355,0.1,10), |
| (16,'M',0.5,0.4,0.13,12), |
| (17,'I',0.355,0.28,0.085,7), |
| (18,'F',0.44,0.34,0.1,10), |
| (19,'M',0.365,0.295,0.08,7), |
| (20,'M',0.45,0.32,0.1,9); |
| </pre> |
| -# Train a linear regression model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_svm_regression, abalone_svm_regression_summary; |
| SELECT madlib.svm_regression('abalone', |
| 'abalone_svm_regression', |
| 'rings', |
| 'ARRAY[1, length, diameter, height]' |
| ); |
| \\x on |
| SELECT * FROM abalone_svm_regression; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------+----------------------------------------------------------------------- |
| coef | {1.998949892503,0.918517478913099,0.712125856084095,0.229379472956877} |
| loss | 8.29033295818392 |
| norm_of_gradient | 23.225177785827 |
| num_iterations | 100 |
| num_rows_processed | 20 |
| num_rows_skipped | 0 |
| dep_var_mapping | {NULL} |
| </pre> |
| -# Predict using the linear regression model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_regr; |
| SELECT madlib.svm_predict('abalone_svm_regression', |
| 'abalone', |
| 'id', |
| 'abalone_regr'); |
| \\x off |
| SELECT * FROM abalone JOIN abalone_regr USING (id) ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex | length | diameter | height | rings | prediction | decision_function |
| ----+-----+--------+----------+--------+-------+------------------+------------------- |
| 1 | M | 0.455 | 0.365 | 0.095 | 15 | 2.69859240928376 | 2.69859240928376 |
| 2 | M | 0.35 | 0.265 | 0.09 | 7 | 2.52978857282818 | 2.52978857282818 |
| 3 | F | 0.53 | 0.42 | 0.135 | 9 | 2.81582333426116 | 2.81582333426116 |
| 4 | M | 0.44 | 0.365 | 0.125 | 10 | 2.69169603073001 | 2.69169603073001 |
| 5 | I | 0.33 | 0.255 | 0.08 | 7 | 2.50200316683054 | 2.50200316683054 |
| 6 | I | 0.425 | 0.3 | 0.095 | 8 | 2.62474869654157 | 2.62474869654157 |
| 7 | F | 0.53 | 0.415 | 0.15 | 20 | 2.81570339722408 | 2.81570339722408 |
| 8 | F | 0.545 | 0.425 | 0.125 | 16 | 2.83086793257882 | 2.83086793257882 |
| 9 | M | 0.475 | 0.37 | 0.125 | 9 | 2.72740477577673 | 2.72740477577673 |
| 10 | F | 0.55 | 0.44 | 0.15 | 19 | 2.8518768970598 | 2.8518768970598 |
| 11 | F | 0.525 | 0.38 | 0.14 | 14 | 2.78389260680315 | 2.78389260680315 |
| 12 | M | 0.43 | 0.35 | 0.11 | 10 | 2.66838827339779 | 2.66838827339779 |
| 13 | M | 0.49 | 0.38 | 0.135 | 11 | 2.75059759385832 | 2.75059759385832 |
| 14 | F | 0.535 | 0.405 | 0.145 | 10 | 2.81202782833915 | 2.81202782833915 |
| 15 | F | 0.47 | 0.355 | 0.1 | 10 | 2.70639581129576 | 2.70639581129576 |
| 16 | M | 0.5 | 0.4 | 0.13 | 12 | 2.77287839069521 | 2.77287839069521 |
| 17 | I | 0.355 | 0.28 | 0.085 | 7 | 2.54391615211472 | 2.54391615211472 |
| 18 | F | 0.44 | 0.34 | 0.1 | 10 | 2.66815839489651 | 2.66815839489651 |
| 19 | M | 0.365 | 0.295 | 0.08 | 7 | 2.56263631931732 | 2.56263631931732 |
| 20 | M | 0.45 | 0.32 | 0.1 | 9 | 2.66310105219146 | 2.66310105219146 |
| (20 rows) |
| </pre> |
| RMS error: |
| <pre class="example"> |
| SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone |
| JOIN abalone_regr USING (id); |
| </pre> |
| <pre class="result"> |
| rms_error |
| -----------------+ |
| 9.0884271818321 |
| (1 row) |
| </pre> |
| -# Train a non-linear regression model using a Gaussian kernel: |
| <pre class="example">DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary, abalone_svm_gaussian_regression_random; |
| SELECT madlib.svm_regression( 'abalone', |
| 'abalone_svm_gaussian_regression', |
| 'rings', |
| 'ARRAY[1, length, diameter, height]', |
| 'gaussian', |
| 'n_components=10', |
| '', |
| 'init_stepsize=1, max_iter=200' |
| ); |
| \\x on |
| SELECT * FROM abalone_svm_gaussian_regression; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {4.49016341280977,2.19062972461334,-2.04673653356154,1.11216153651262,2.83478599238881,-4.23122821845785,4.17684533744501,-5.36892552740644,0.775782561685621,-3.62606941016707} |
| loss | 2.66850539541894 |
| norm_of_gradient | 0.97440079536379 |
| num_iterations | 163 |
| num_rows_processed | 20 |
| num_rows_skipped | 0 |
| dep_var_mapping | {NULL} |
| </pre> |
| -# Predict using Gaussian regression model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_gaussian_regr; |
| SELECT madlib.svm_predict('abalone_svm_gaussian_regression', |
| 'abalone', |
| 'id', |
| 'abalone_gaussian_regr'); |
| \\x off |
| SELECT * FROM abalone JOIN abalone_gaussian_regr USING (id) ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex | length | diameter | height | rings | prediction | decision_function |
| ----+-----+--------+----------+--------+-------+------------------+------------------- |
| 1 | M | 0.455 | 0.365 | 0.095 | 15 | 9.92189555675422 | 9.92189555675422 |
| 2 | M | 0.35 | 0.265 | 0.09 | 7 | 9.81553107620013 | 9.81553107620013 |
| 3 | F | 0.53 | 0.42 | 0.135 | 9 | 10.0847384862759 | 10.0847384862759 |
| 4 | M | 0.44 | 0.365 | 0.125 | 10 | 10.0100000075406 | 10.0100000075406 |
| 5 | I | 0.33 | 0.255 | 0.08 | 7 | 9.74093262454458 | 9.74093262454458 |
| 6 | I | 0.425 | 0.3 | 0.095 | 8 | 9.94807651709641 | 9.94807651709641 |
| 7 | F | 0.53 | 0.415 | 0.15 | 20 | 10.1448936105369 | 10.1448936105369 |
| 8 | F | 0.545 | 0.425 | 0.125 | 16 | 10.0579420659954 | 10.0579420659954 |
| 9 | M | 0.475 | 0.37 | 0.125 | 9 | 10.055724626407 | 10.055724626407 |
| 10 | F | 0.55 | 0.44 | 0.15 | 19 | 10.1225030222559 | 10.1225030222559 |
| 11 | F | 0.525 | 0.38 | 0.14 | 14 | 10.160706707435 | 10.160706707435 |
| 12 | M | 0.43 | 0.35 | 0.11 | 10 | 9.95760174386841 | 9.95760174386841 |
| 13 | M | 0.49 | 0.38 | 0.135 | 11 | 10.0981242315617 | 10.0981242315617 |
| 14 | F | 0.535 | 0.405 | 0.145 | 10 | 10.1501121415596 | 10.1501121415596 |
| 15 | F | 0.47 | 0.355 | 0.1 | 10 | 9.97689437628973 | 9.97689437628973 |
| 16 | M | 0.5 | 0.4 | 0.13 | 12 | 10.0633271219326 | 10.0633271219326 |
| 17 | I | 0.355 | 0.28 | 0.085 | 7 | 9.79492924255328 | 9.79492924255328 |
| 18 | F | 0.44 | 0.34 | 0.1 | 10 | 9.94856833428783 | 9.94856833428783 |
| 19 | M | 0.365 | 0.295 | 0.08 | 7 | 9.78278863173308 | 9.78278863173308 |
| 20 | M | 0.45 | 0.32 | 0.1 | 9 | 9.98822477687532 | 9.98822477687532 |
| (20 rows) |
| </pre> |
| Compute the RMS error. Note this produces a more accurate result than |
| the linear case for this data set: |
| <pre class="example"> |
| SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone |
| JOIN abalone_gaussian_regr USING (id); |
| </pre> |
| <pre class="result"> |
| rms_error |
| ------------------+ |
| 3.83678516581768 |
| (1 row) |
| </pre> |
| -# Cross validation. Let's run cross validation for different initial step sizes and lambda values: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_svm_gaussian_regression, abalone_svm_gaussian_regression_summary, |
| abalone_svm_gaussian_regression_random, abalone_svm_gaussian_regression_cv; |
| SELECT madlib.svm_regression( 'abalone', |
| 'abalone_svm_gaussian_regression', |
| 'rings', |
| 'ARRAY[1, length, diameter, height]', |
| 'gaussian', |
| 'n_components=10', |
| '', |
| 'init_stepsize=[0.01,1], n_folds=3, max_iter=200, lambda=[0.01, 0.1, 0.5], |
| validation_result=abalone_svm_gaussian_regression_cv' |
| ); |
| \\x on |
| SELECT * FROM abalone_svm_gaussian_regression; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {4.46074154389204,2.19335800415975,-2.14775901092668,1.06805891149535,2.91168496475457,-3.95521278459095,4.20496790233169,-5.28144330907061,0.427743633754918,-3.58999505728692} |
| loss | 2.68317592175908 |
| norm_of_gradient | 0.69852112502746 |
| num_iterations | 169 |
| num_rows_processed | 20 |
| num_rows_skipped | 0 |
| dep_var_mapping | {NULL} |
| </pre> |
| View the summary table showing the final model parameters are those that produced |
| the lowest error in the cross validation runs: |
| <pre class="example"> |
| SELECT * FROM abalone_svm_gaussian_regression_summary; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]--------+------------------------------------------------------------------------------------ |
| method | SVR |
| version_number | 1.15-dev |
| source_table | abalone |
| model_table | abalone_svm_gaussian_regression |
| dependent_varname | rings |
| independent_varname | ARRAY[1, length, diameter, height] |
| kernel_func | gaussian |
| kernel_params | gamma=0.25, n_components=10,random_state=1, fit_intercept=False, fit_in_memory=True |
| grouping_col | NULL |
| optim_params | init_stepsize=1.0, |
| | decay_factor=0.9, |
| | max_iter=200, |
| | tolerance=1e-10, |
| | epsilon=0.01, |
| | eps_table=, |
| | class_weight= |
| reg_params | lambda=0.01, norm=l2, n_folds=3 |
| num_all_groups | 1 |
| num_failed_groups | 0 |
| total_rows_processed | 20 |
| total_rows_skipped | 0 |
| (6 rows) |
| </pre> |
| View the statistics for the various cross validation values: |
| <pre class="example"> |
| \\x off |
| SELECT * FROM abalone_svm_gaussian_regression_cv; |
| </pre> |
| <pre class="result"> |
| init_stepsize | lambda | mean_score | std_dev_score |
| ---------------+--------+----------------+---------------- |
| 1.0 | 0.01 | -4.06711568585 | 0.435966381366 |
| 1.0 | 0.1 | -4.08068428345 | 0.44660797513 |
| 1.0 | 0.5 | -4.52576046087 | 0.20597876382 |
| 0.01 | 0.01 | -11.0231044189 | 0.739956548721 |
| 0.01 | 0.1 | -11.0244799274 | 0.740029346709 |
| 0.01 | 0.5 | -11.0305445077 | 0.740350338532 |
| (6 rows) |
| </pre> |
| -# Predict using the cross-validated Gaussian regression model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_gaussian_regr; |
| SELECT madlib.svm_predict('abalone_svm_gaussian_regression', |
| 'abalone', |
| 'id', |
| 'abalone_gaussian_regr'); |
| </pre> |
| Compute the RMS error. Note this produces a more accurate result than |
| the previous run with the Gaussian kernel: |
| <pre class="example"> |
| SELECT SQRT(AVG((rings-prediction)*(rings-prediction))) as rms_error FROM abalone |
| JOIN abalone_gaussian_regr USING (id); |
| </pre> |
| <pre class="result"> |
| rms_error |
| ------------------+ |
| 3.84208909699442 |
| (1 row) |
| </pre> |
| |
| <h4>Novelty Detection</h4> |
| |
| -# Now train a non-linear one-class SVM for novelty detection, using a Gaussian kernel. |
| Note that the dependent variable is not a parameter for one-class: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_one_class_gaussian, houses_one_class_gaussian_summary, houses_one_class_gaussian_random; |
| select madlib.svm_one_class('houses', |
| 'houses_one_class_gaussian', |
| 'ARRAY[1,tax,bedroom,bath,size,lot,price]', |
| 'gaussian', |
| 'gamma=0.5,n_components=55, random_state=3', |
| NULL, |
| 'max_iter=100, init_stepsize=10,lambda=10, tolerance=0' |
| ); |
| \\x on |
| SELECT * FROM houses_one_class_gaussian; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {redacted for brevity} |
| loss | 0.944016313708205 |
| norm_of_gradient | 14.5271059047443 |
| num_iterations | 100 |
| num_rows_processed | 16 |
| num_rows_skipped | -1 |
| dep_var_mapping | {-1,1} |
| </pre> |
| -# For the novelty detection using one-class, let's create a test data |
| set using the last 3 values from the training set plus an outlier at |
| the end (10x price): |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_one_class_test; |
| CREATE TABLE houses_one_class_test (id INT, tax INT, bedroom INT, bath FLOAT, price INT, |
| size INT, lot INT); |
| INSERT INTO houses_one_class_test VALUES |
| (1 , 3100 , 3 , 2 , 140000 , 1760 , 38000), |
| (2 , 2070 , 2 , 3 , 148000 , 1550 , 14000), |
| (3 , 650 , 3 , 1.5 , 65000 , 1450 , 12000), |
| (4 , 650 , 3 , 1.5 , 650000 , 1450 , 12000); |
| </pre> |
| Now run prediction on the Gaussian one-class novelty detection model. |
| Result shows the last row predicted to be novel: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_pred; |
| SELECT madlib.svm_predict('houses_one_class_gaussian', |
| 'houses_one_class_test', |
| 'id', |
| 'houses_pred'); |
| \\x off |
| SELECT * FROM houses_one_class_test JOIN houses_pred USING (id) ORDER BY id; |
| </pre> |
| Result showing the last row predicted to be novel: |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | prediction | decision_function |
| ----+------+---------+------+--------+------+-------+------------+--------------------- |
| 1 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 1 | 0.111497008121437 |
| 2 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 1 | 0.0996021345169148 |
| 3 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | 1 | 0.0435064008756942 |
| 4 | 650 | 3 | 1.5 | 650000 | 1450 | 12000 | -1 | -0.0168967845338403 |
| </pre> |
| |
| @anchor background |
| @par Technical Background |
| |
| To solve linear SVM, the following objective function is minimized: |
| \f[ |
| \underset{w,b}{\text{Minimize }} \lambda||w||^2 + \frac{1}{n}\sum_{i=1}^n \ell(y_i,f_{w,b}(x_i)) |
| |
| \f] |
| |
| where \f$(x_1,y_1),\ldots,(x_n,y_n)\f$ are labeled training data and |
| \f$\ell(y,f(x))\f$ is a loss function. When performing classification, |
| \f$\ell(y,f(x)) = \max(0,1-yf(x))\f$ is the <em>hinge loss</em>. |
| For regression, the loss function \f$\ell(y,f(x)) = \max(0,|y-f(x)|-\epsilon)\f$ |
| is used. |
| |
| If \f$ f_{w,b}(x) = \langle w, x\rangle + b\f$ is linear, then the |
| objective function is convex and incremental gradient descent (IGD, or SGD) |
| can be applied to find a global minimum. See Feng, et al. [1] for more details. |
| |
| To learn with Gaussian or polynomial kernels, the training data is first mapped |
| via a <em>random feature map</em> in such a way that the usual inner product in |
| the feature space approximates the kernel function in the input space. The |
| linear SVM training function is then run on the resulting data. See the papers |
| [2,3] for more information on random feature maps. |
| |
| Also, see the book [4] by Scholkopf and Smola for more details on SVMs in general. |
| |
| @anchor literature |
| @literature |
| |
| @anchor svm-lit-1 |
| [1] Xixuan Feng, Arun Kumar, Ben Recht, and Christopher Re: |
| Towards a Unified Architecture for in-RDBMS analytics, |
| in SIGMOD Conference, 2012 |
| http://www.eecs.berkeley.edu/~brecht/papers/12.FengEtAl.SIGMOD.pdf |
| |
| @anchor svm-lit-2 |
| [2] Purushottam Kar and Harish Karnick: Random Feature Maps for Dot |
| Product Kernels, Proceedings of the 15th International Conference |
| on Artificial Intelligence and Statistics, 2012, |
| http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2012_KarK12.pdf |
| |
| @anchor svm-lit-3 |
| [3] Ali Rahmini and Ben Recht: Random Features for Large-Scale |
| Kernel Machines, Neural Information Processing Systems 2007, |
| http://www.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf |
| |
| @anchor svm-lit-4 |
| [4] Bernhard Scholkopf and Alexander Smola: Learning with Kernels, |
| The MIT Press, Cambridge, MA, 2002. |
| |
| @anchor svm-lit-5 |
| [5] Vladimir Cherkassky and Yunqian Ma: Practical Selection of SVM Parameters |
| and Noise Estimation for SVM Regression, Neural Networks, 2004 |
| http://www.ece.umn.edu/users/cherkass/N2002-SI-SVM-13-whole.pdf |
| |
| @anchor related |
| @par Related Topics |
| |
| File svm.sql_in documenting the training function |
| |
| @internal |
| @sa Namespace SVM (documenting the driver/outer loop implemented in |
| Python), Namespace |
| \ref madlib::modules::regress documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.linear_svm_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.linear_svm_result AS ( |
| coefficients double precision[], |
| loss double precision, |
| norm_of_gradient double precision, |
| num_rows_processed bigint |
| ); |
| |
| -------------------------------------------------------------------------- |
| -- create SQL functions for IGD optimizer |
| -------------------------------------------------------------------------- |
| -- cannot be labeled as STRICT because we set previous_state NULL initially |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_transition( |
| state double precision[], |
| ind_var double precision[], |
| dep_var double precision, |
| previous_state double precision[], |
| dimension integer, |
| stepsize double precision, |
| reg double precision, |
| is_l2 boolean, |
| n_tuples integer, |
| epsilon double precision, |
| is_svc boolean, |
| tuple_weight double precision |
| ) |
| RETURNS double precision[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_merge( |
| state1 double precision[], |
| state2 double precision[]) |
| RETURNS double precision[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_final( |
| state double precision[]) |
| RETURNS double precision[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); |
| |
| /** |
| * @internal |
| * @brief Perform one iteration of the incremental gradient |
| * method for computing linear support vector machine |
| */ |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linear_svm_igd_step( |
| /*+ ind_var */ double precision[], |
| /*+ dep_var */ double precision, |
| /*+ previous_state */ double precision[], |
| /*+ dimension */ integer, |
| /*+ stepsize */ double precision, |
| /*+ reg */ double precision, |
| /*+ is_l2 */ boolean, |
| /*+ n_tuples */ integer, |
| /*+ epsilon */ double precision, |
| /*+ is_svc */ boolean, |
| /*+ tuple_weight */ double precision |
| ); |
| CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_step( |
| /*+ ind_var */ double precision[], |
| /*+ dep_var */ double precision, |
| /*+ previous_state */ double precision[], |
| /*+ dimension */ integer, |
| /*+ stepsize */ double precision, |
| /*+ reg */ double precision, |
| /*+ is_l2 */ boolean, |
| /*+ n_tuples */ integer, |
| /*+ epsilon */ double precision, |
| /*+ is_svc */ boolean, |
| /*+ tuple_weight */ double precision |
| ) ( |
| STYPE=double precision[], |
| SFUNC=MADLIB_SCHEMA.linear_svm_igd_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linear_svm_igd_merge,') |
| FINALFUNC=MADLIB_SCHEMA.linear_svm_igd_final, |
| INITCOND='{0,0,0,0,0,0,0}' |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_distance( |
| /*+ state1 */ double precision[], |
| /*+ state2 */ double precision[]) |
| RETURNS double precision AS 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_result( |
| /*+ state */ double precision[]) |
| RETURNS MADLIB_SCHEMA.linear_svm_result AS 'MODULE_PATHNAME' |
| LANGUAGE c IMMUTABLE STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text, |
| params text, |
| verbose bool) |
| RETURNS void AS $$ |
| # indent according to PythonFunction |
| global is_svc |
| is_svc = False |
| PythonFunction(svm, svm, svm) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text, |
| params text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| ----------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text, |
| params text, |
| verbose bool) |
| RETURNS void AS $$ |
| # indent according to PythonFunction |
| global is_svc |
| is_svc = True |
| PythonFunction(svm, svm, svm) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| -- all default value handling implemented in Python |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text, |
| params text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, $8, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text, |
| kernel_func text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification( |
| source_table text, |
| model_table text, |
| dependent_varname text, |
| independent_varname text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| ----------------------------------------------------------------- |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class( |
| source_table text, |
| model_table text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text, |
| params text, |
| verbose bool) |
| RETURNS void AS $$ |
| PythonFunction(svm, svm, svm_one_class) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| -- all default value handling implemented in Python |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class( |
| source_table text, |
| model_table text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text, |
| params text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, $7, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class( |
| source_table text, |
| model_table text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text, |
| grouping_col text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, ''::text, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class( |
| source_table text, |
| model_table text, |
| independent_varname text, |
| kernel_func text, |
| kernel_params text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, ''::text, ''::text, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class( |
| source_table text, |
| model_table text, |
| independent_varname text, |
| kernel_func text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, ''::text, ''::text, ''::text, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class( |
| source_table text, |
| model_table text, |
| independent_varname text) |
| RETURNS void AS $$ |
| SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, ''::text, |
| ''::text, ''::text, ''::text, FALSE); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| |
| |
| ------ Prediction ------------------------------------------------------------- |
| /** |
| * @brief Scores the data points stored in a table using a learned linear support-vector model |
| * @param model_table Name of table where the learned model to be used is stored |
| * @param new_data_table Name of table/view containing the data points to be scored |
| * @param id_col Name of column in new_data_table containing the integer identifier of data points |
| * |
| * |
| * |
| * @param output_table Name of table to store the results |
| * |
| * @return Textual summary of the algorithm run |
| * |
| * @internal |
| * @sa This function is a wrapper for svm.svm_predict(). |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict( |
| model_table text, |
| new_data_table text, |
| id_col_name text, |
| output_table text) |
| RETURNS void AS $$ |
| PythonFunction(svm, svm, svm_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
| -- Online Help ----------------------------------------------------------- |
| |
| /** |
| * @brief Help function |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunction(svm, svm, svm_predict_help) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict() |
| RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA.svm_predict(NULL::TEXT) |
| $$ LANGUAGE SQL IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict( |
| message text) |
| RETURNS TEXT AS $$ |
| PythonFunction(svm, svm, svm_predict_help) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification ( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(svm, svm) |
| return svm.svm_help(schema_madlib, message, True) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification () |
| RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA.svm_classification(NULL::TEXT) |
| $$ LANGUAGE SQL IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class ( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(svm, svm) |
| return svm.svm_one_class_help(schema_madlib, message, True) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class () |
| RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA.svm_one_class(NULL::TEXT) |
| $$ LANGUAGE SQL IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression ( |
| message TEXT |
| ) RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(svm, svm) |
| return svm.svm_help(schema_madlib, message, False) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression () |
| RETURNS TEXT AS $$ |
| SELECT MADLIB_SCHEMA.svm_regression(''::TEXT) |
| $$ LANGUAGE SQL IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |