| /* ----------------------------------------------------------------------- */ |
| /** |
| * |
| * @file svm.sql_in |
| * |
| * @brief SQL functions for Support Vector Machines |
| * @date July 2015 |
| * |
| * @sa For a brief introduction to Support Vector Machines, see the |
| * module description \ref grp_svm. |
| * |
| */ |
| /* ----------------------------------------------------------------------- */ |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_svm |
| |
| <div class="toc"><b>Contents</b><ul> |
| <li class="level1"><a href="#svm_classification">Classification Function</a></li> |
| <li class="level1"><a href="#svm_regression">Regression Function</a></li> |
| <li class="level1"><a href="#novelty_detection">Novelty Detection</a></li> |
| <li class="level1"><a href="#kernel_params">Kernel Parameters</a></li> |
| <li class="level1"><a href="#parameters">Other Parameters</a></li> |
| <li class="level1"><a href="#predict">Prediction Functions</a></li> |
| <li class="level1"><a href="#example">Examples</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| <li class="level1"><a href="#related">Related Topics</a></li> |
| </ul></div> |
| |
| Support Vector Machines (SVMs) are models for regression and classification |
| tasks. SVM models have two particularly desirable features: robustness in the |
| presence of noisy data and applicability to a variety of data configurations. At |
| its core, a <em>linear</em> SVM model is a hyperplane separating two |
| distinct classes of data (in the case of classification problems), in such a way |
| that the distance between the hyperplane and the nearest training data point |
| (called the <em>margin</em>) is maximized. Vectors that lie on this margin are |
| called support vectors. With the support vectors fixed, perturbations of vectors |
| beyond the margin will not affect the model; this contributes to the model’s |
| robustness. By substituting a kernel function for the usual inner product, one can |
| approximate a large variety of decision boundaries in addition to linear hyperplanes. |
| @brief Solves classification and regression problems by separating data with |
| a hyperplane or other nonlinear decision boundary. |
| |
| @anchor svm_classification |
| @par Classification Training Function |
| The SVM classification training function has the following format: |
| <pre class="syntax"> |
| svm_classification( |
| source_table, |
| model_table, |
| dependent_varname, |
| independent_varname, |
| kernel_func, |
| kernel_params, |
| grouping_col, |
| params, |
| verbose |
| ) |
| </pre> |
| \b Arguments |
| <DL class="arglist"> |
| <DT>source_table</DT> |
| <DD>TEXT. Name of the table containing the training data.</DD> |
| |
| <DT>model_table</DT> |
| <DD>TEXT. Name of the output table containing the model. Details of the output |
| tables are provided below. |
| </DD> |
| |
| <DT>dependent_varname</DT> |
| <DD> TEXT. Name of the dependent variable column. For classification, this column |
| can contain values of any type, but must assume exactly two distinct values. |
| Otherwise, an error will be thrown. |
| </DD> |
| |
| <DT>independent_varname</DT> |
| <DD>TEXT. Expression list to evaluate for the |
| independent variables. An intercept variable should not be included as part |
| of this expression. See 'fit_intercept' in the kernel params for info on |
| intercepts. Please note that the expression should be able to be cast |
| to DOUBLE PRECISION[].</DD> |
|  |
| <DT>kernel_func (optional)</DT> |
| <DD>TEXT, default: 'linear'. |
| Type of kernel. Currently three kernel types are supported: 'linear', |
| 'gaussian', and 'polynomial'. The text can be any prefix of the three |
| strings; e.g., kernel_func='ga' will create a Gaussian kernel. |
| </DD> |
| |
| <DT>kernel_params (optional)</DT> |
| <DD>TEXT, default: NULL. |
| Parameters for non-linear kernel in a comma-separated string of key-value pairs. |
| The actual parameters differ depending on the value of \e kernel_func. |
| See the description below for details. |
| </DD> |
| |
| <DT>grouping_col (optional)</DT> |
| <DD>TEXT, default: NULL. An expression list used to group |
| the input dataset into discrete groups, which results in running one model per group. |
| Similar to the SQL "GROUP BY" clause. When this value is NULL, no |
| grouping is used and a single model is generated. Please note that |
| cross validation is not supported if grouping is used.</DD> |
| |
| <DT>params (optional)</DT> |
| <DD>TEXT, default: NULL. |
| Parameters for optimization and regularization in a comma-separated string |
| of key-value pairs. If a list of values is provided, then cross-validation |
| will be performed to select the \e best value from the list. See the |
| description below for details. |
| </DD> |
| |
| <DT>verbose (optional)</DT> |
| <DD>BOOLEAN default: FALSE. |
| Verbose output of the results of training.</DD> |
| </DL> |
| |
| <b>Output tables</b> |
| <br> |
| The model table produced by SVM contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>coef</th> |
| <td>FLOAT8. Vector of coefficients.</td> |
| </tr> |
| <tr> |
| <th>grouping_key</th> |
| <td>TEXT. Identifies the group to which the datum belongs.</td> |
| </tr> |
| <tr> |
| <th>num_rows_processed</th> |
| <td>BIGINT. Number of rows processed.</td> |
| </tr> |
| <tr> |
| <th>num_rows_skipped</th> |
| <td>BIGINT. Number of rows skipped due to missing values or failures.</td> |
| </tr> |
| <tr> |
| <th>num_iterations</th> |
| <td>INTEGER. Number of iterations completed by stochastic gradient descent |
| algorithm. The algorithm either converged in this number of iterations |
| or hit the maximum number specified in the optimization parameters. </td> |
| </tr> |
| <tr> |
| <th>loss</th> |
| <td>FLOAT8. Value of the objective function of SVM. See Technical Background section below for more details.</td> |
| </tr> |
| <tr> |
| <th>norm_of_gradient</th> |
| <td>FLOAT8. Value of the L2-norm of the (sub)-gradient of the objective function.</td> |
| </tr> |
| <tr> |
| <th>__dep_var_mapping</th> |
| <td>TEXT[]. Vector of dependent variable labels. The first entry |
| corresponds to -1 and the second to +1. For internal use only.</td> |
| </tr> |
| </table> |
| |
| An auxiliary table named \<model_table\>_random is created if the kernel is |
| not linear. It contains data needed to embed test data into a random feature |
| space (see references [2,3]). This data is used internally by svm_predict |
| and not meaningful on its own to the user, so you can ignore it. |
| |
| A summary table named \<model_table\>_summary is also created, which has the following columns: |
| <table class="output"> |
| <tr> |
| <th>method</th> |
| <td>'svm'</td> |
| </tr> |
| <tr> |
| <th>version_number</th> |
| <td>Version of MADlib which was used to generate the model.</td> |
| </tr> |
| <tr> |
| <th>source_table</th> |
| <td>The data source table name.</td> |
| </tr> |
| <tr> |
| <th>model_table</th> |
| <td>The model table name.</td> |
| </tr> |
| <tr> |
| <th>dependent_varname</th> |
| <td>The dependent variable.</td> |
| </tr> |
| <tr> |
| <th>independent_varname</th> |
| <td>The independent variables.</td> |
| </tr> |
| <tr> |
| <th>kernel_func</th> |
| <td>The kernel function.</td> |
| </tr> |
| <tr> |
| <th>kernel_parameters</th> |
| <td>The kernel parameters, as well as random feature map data.</td> |
| </tr> |
| <tr> |
| <th>grouping_col</th> |
| <td>Columns on which to group.</td> |
| </tr> |
| <tr> |
| <th>optim_params</th> |
| <td>A string containing the optimization parameters.</td> |
| </tr> |
| <tr> |
| <th>reg_params</th> |
| <td>A string containing the regularization parameters.</td> |
| </tr> |
| <tr> |
| <th>num_all_groups</th> |
| <td>Number of groups in SVM training.</td> |
| </tr> |
| <tr> |
| <th>num_failed_groups</th> |
| <td>Number of failed groups in SVM training.</td> |
| </tr> |
| <tr> |
| <th>total_rows_processed</th> |
| <td>Total number of rows processed in all groups.</td> |
| </tr> |
| <tr> |
| <th>total_rows_skipped</th> |
| <td>Total number of rows skipped in all groups due to missing |
| values or failures.</td> |
| </tr> |
| </table> |
| |
| |
| @anchor svm_regression |
| @par Regression Training Function |
| The SVM regression training function has the following format: |
| <pre class="syntax"> |
| svm_regression(source_table, |
| model_table, |
| dependent_varname, |
| independent_varname, |
| kernel_func, |
| kernel_params, |
| grouping_col, |
| params, |
| verbose |
| ) |
| </pre> |
| |
| \b Arguments |
| |
| Specifications for regression are largely the same as for classification. In the |
| model table, there is no dependent variable mapping. The following |
| arguments have specifications which differ from svm_classification: |
| <DL class="arglist"> |
| <DT>dependent_varname</DT> |
| <DD>TEXT. Name of the dependent variable column. For regression, this column |
| can contain only values or expressions that can be cast to DOUBLE PRECISION. |
| Otherwise, an error will be thrown. |
| </DD> |
| <DT>params (optional)</DT> |
| <DD>TEXT, default: NULL. |
| The parameters \e epsilon and \e eps_table are only meaningful for regression. |
| See description below for more details. |
| </DD> |
| </DL> |
| |
| @anchor novelty_detection |
| @par Novelty Detection Training Function |
| The novelty detection function is a one-class SVM classifier, and has the following format: |
| <pre class="syntax"> |
| svm_one_class( |
| source_table, |
| model_table, |
| independent_varname, |
| kernel_func, |
| kernel_params, |
| grouping_col, |
| params, |
| verbose |
| ) |
| </pre> |
| \b Arguments |
| |
| Specifications for novelty detection are largely the same as for classification, |
| except the dependent variable name is not specified. The model table is the same |
| as that for classification. |
| |
| @anchor kernel_params |
| @par Kernel Parameters |
| Kernel parameters are supplied in a string containing a comma-delimited |
| list of name-value pairs. All of these named parameters are optional, and |
| their order does not matter. You must use the format "<param_name> = <value>" |
| to specify the value of a parameter, otherwise the parameter is ignored. |
| |
| <DL class="arglist"> |
| <DT><i>Parameters common to all kernels</i></dt><dd></dd> |
| <DT>fit_intercept</dt> |
| <DD>Default: True. The parameter \e fit_intercept is an indicator to add an |
| intercept to the \e independent_varname array expression. The intercept is added |
| to the end of the feature list - thus the last element of the coefficient list |
| is the intercept. |
| </DD> |
| <DT>n_components</DT> |
| <DD>Default: 2*num_features. The dimensionality of the transformed feature space. |
| A larger value lowers the variance of the estimate of the kernel but requires |
| more memory and takes longer to train.</DD> |
| <DT>random_state</DT> |
| <DD>Default: 1. Seed used by a random number generator. </DD> |
| </DL> |
| |
| <DL class="arglist"> |
| <DT><i>Parameters for 'gaussian' kernel</i></dt><dd></dd> |
| <DT>gamma</dt> |
| <DD> Default: 1/num_features. The parameter \f$\gamma\f$ in the Radial Basis Function |
| kernel, i.e., \f$\exp(-\gamma||x-y||^2)\f$. Choosing a proper value for \e gamma |
| is critical to the performance of kernel machine; e.g., while a large \e gamma |
| tends to cause overfitting, a small \e gamma will make the model too constrained |
| to capture the complexity of the data. |
| </DD> |
| </DL> |
| |
| <DL class="arglist"> |
| <DT><i>Parameters for 'polynomial' kernel</i></dt><dd></dd> |
| <DT>coef0</dt> |
| <DD>Default: 1.0. The independent term \f$q\f$ in \f$ (\langle x,y\rangle + q)^r \f$. |
| Must be larger than or equal to 0. When it is 0, the polynomial kernel is in homogeneous form. |
| </DD> |
| <DT>degree</dt> |
| <DD>Default: 3. The parameter \f$r\f$ in \f$ (\langle x,y\rangle + q)^r \f$. |
| </DD> |
| </DL> |
| |
| |
| @anchor parameters |
| @par Other Parameters |
| Parameters in this section are supplied in the \e params argument as a string |
| containing a comma-delimited list of name-value pairs. All of these named |
| parameters are optional, and their order does not matter. You must use the |
| format "<param_name> = <value>" to specify the value of a parameter, otherwise |
| the parameter is ignored. |
| |
| Hyperparameter optimization can be carried out using the built-in cross |
| validation mechanism, which is activated by assigning a value greater than 1 to |
| the parameter \e n_folds in \e params. |
| Please note that cross validation is not |
| supported if grouping is used. |
| |
| The values of a parameter to cross validate should be provided in a list. For |
| example, if one wanted to regularize with the L1 norm and use a lambda value |
| from the set {0.3, 0.4, 0.5}, one might input 'lambda={0.3, 0.4, 0.5}, norm=L1, |
| n_folds=10' in \e params. Note that both '{}' and '[]' are valid |
| here. |
| @note |
| Note that not all of the parameters below can be cross-validated. For |
| parameters where cross validation is allowed, their default values are presented |
| in list format; e.g., [0.01]. |
| |
| <pre class="syntax"> |
| 'init_stepsize = <value>, |
| decay_factor = <value>, |
| max_iter = <value>, |
| tolerance = <value>, |
| lambda = <value>, |
| norm = <value>, |
| epsilon = <value>, |
| eps_table = <value>, |
| validation_result = <value>, |
| n_folds = <value>, |
| class_weight = <value>' |
| </pre> |
| \b Parameters |
| <DL class="arglist"> |
| |
| <DT>init_stepsize</dt> |
| <DD>Default: [0.01]. |
| Also known as the initial learning rate. A small value is usually desirable to |
| ensure convergence, while a large value provides more room for progress during |
| training. Since the best value depends on the condition number of the data, in |
| practice one often searches in an exponential grid using built-in cross |
| validation; e.g., "init_stepsize = [1, 0.1, 0.001]". To reduce training time, it |
| is common to run cross validation on a subsampled dataset, since this usually |
| provides a good estimate of the condition number of the whole dataset. Then the |
| resulting \e init_stepsize can be run on the whole dataset. |
| |
| </DD> |
| |
| <DT>decay_factor</DT> |
| <DD>Default: [0.9]. Control the learning rate schedule: 0 means constant rate; |
| -1 means inverse scaling, i.e., stepsize = init_stepsize / iteration; > 0 means |
| exponential decay, i.e., stepsize = init_stepsize * decay_factor^iteration. |
| </DD> |
| |
| <DT>max_iter</dt> |
| <DD>Default: [100]. The maximum number of iterations allowed. |
| </DD> |
| |
| <DT>tolerance</dt> |
| <DD>Default: 1e-10. The criterion to end iterations. The training stops whenever |
| the difference between the training models of two consecutive iterations is |
| smaller than \e tolerance or the iteration number is larger than \e max_iter. |
| </DD> |
| |
| <DT>lambda</dt> |
| <DD>Default: [0.01]. Regularization parameter. Must be non-negative. |
| </DD> |
| |
| <DT>norm</dt> |
| <DD>Default: 'L2'. Name of the regularization, either 'L2' or 'L1'. |
| </DD> |
| |
| <DT>epsilon</dt> |
| <DD>Default: [0.01]. |
| Determines the \f$\epsilon\f$ for \f$\epsilon\f$-regression. Ignored during classification. |
| When training the model, differences of less than \f$\epsilon\f$ between estimated labels |
| and actual labels are ignored. A larger \f$\epsilon\f$ will yield a model |
| with fewer support vectors, but will not generalize as well to future data. |
| Generally, it has been suggested that epsilon should increase with noisier |
| data, and decrease with the number of samples. See [5]. |
| </DD> |
| |
| <DT>eps_table</dt> |
| <DD>Default: NULL. |
| Name of the input table that contains values of epsilon for different groups. |
| Ignored when \e grouping_col is NULL. Define this input table if you want |
| different epsilon values for different groups. The table consists of a column |
| named \e epsilon which specifies the epsilon values, and one or more columns for |
| \e grouping_col. Extra groups are ignored, and groups not present in this table |
| will use the epsilon value specified in parameter \e epsilon. |
| </DD> |
| |
| <DT>validation_result</dt> |
| <DD>Default: NULL. |
| Name of the table to store the cross validation results including the values of |
| parameters and their averaged error values. For now, simple metric like 0-1 loss |
| is used for classification and mean square error is used for regression. The |
| table is only created if the name is not NULL. |
| </DD> |
| |
| <DT>n_folds</dt> |
| <DD>Default: 0. |
| Number of folds (k). Must be at least 2 to activate cross validation. |
| If a value of k > 1 is specified, each fold is then used as a validation set once, |
| while the other k - 1 folds form the training set. |
| </DD> |
| |
| <DT>class_weight</dt> |
| <DD>Default: 1 for classification, 'balanced' for one-class novelty detection, |
| n/a for regression. |
| |
| Set the weight for the positive and negative classes. If not given, all classes |
| are set to have weight one. |
| If class_weight = balanced, values of y are automatically adjusted as inversely |
| proportional to class frequencies in the input data i.e. the weights are set as |
| n_samples / (n_classes * bincount(y)). |
| |
| Alternatively, class_weight can be a mapping, giving the weight for each class. |
| Eg. For dependent variable values 'a' and 'b', the class_weight can be |
| {a: 2, b: 3}. This would lead to each 'a' tuple's y value multiplied by 2 and |
| each 'b' y value will be multiplied by 3. |
| |
| For regression, the class weights are always one. |
| </DD> |
| </DL> |
| |
| @anchor predict |
| @par Prediction Function |
| The prediction function is used to estimate the conditional mean given a new |
| predictor. The same syntax is used for classification, regression and novelty |
| detection: |
| <pre class="syntax"> |
| svm_predict(model_table, |
| new_data_table, |
| id_col_name, |
| output_table) |
| </pre> |
| |
| \b Arguments |
| <DL class="arglist"> |
| <DT>model_table</DT> |
| <DD>TEXT. Model table produced by the training function.</DD> |
| |
| <DT>new_data_table</DT> |
| <DD>TEXT. Name of the table containing the prediction data. This table is expected |
| to contain the same features that were used during training. The table should |
| also contain id_col_name used for identifying each row.</DD> |
| |
| <DT>id_col_name</DT> |
| <DD>TEXT. The name of the id column in the input table.</DD> |
| |
| <DT>output_table</DT> |
| <DD>TEXT. Name of the table where output predictions are written. If this |
| table name is already in use, then an error is returned. Table contains:</DD> |
| <table class="output"> |
| <tr> |
| <th>id</th> |
| <td>Gives the 'id' for each prediction, corresponding to each row from the new_data_table.</td> |
| </tr> |
| <tr> |
| <th>prediction</th> |
| <td>Provides the prediction for each row in new_data_table. |
| For regression this would be the same as decision_function. For classification, |
| this will be one of the dependent variable values.</td> |
| </tr> |
| <tr> |
| <th>decision_function</th> |
| <td>Provides the distance between each point and the separating hyperplane.</td> |
| </tr> |
| </table> |
| </DL> |
| @anchor example |
| @par Examples |
| -# Create an input data set. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses; |
| CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT, |
| size INT, lot INT); |
| COPY houses FROM STDIN WITH DELIMITER '|'; |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 |
| \\. |
| </pre> |
| -# Train a classification model. First, use a linear model. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm, houses_svm_summary; |
| SELECT madlib.svm_classification('houses', |
| 'houses_svm', |
| 'price < 100000', |
| 'ARRAY[1, tax, bath, size]' |
| ); |
| </pre> |
| -# View the result for the linear classification model. |
| <pre class="example"> |
| -- Set extended display on for easier reading of output |
| \\x ON |
| SELECT * FROM houses_svm; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]------+--------------------------------------------------------------- |
| coef | {0.152192069515,-0.29631947495,0.0968619000065,0.362682248051} |
| loss | 601.279740124 |
| norm_of_gradient | 1300.96615851627 |
| num_iterations | 100 |
| num_rows_processed | 15 |
| num_rows_skipped | 0 |
| dep_var_mapping | {f,t} |
| </pre> |
| -# Next generate a nonlinear model using a Gaussian kernel. This time we specify |
| the initial step size and maximum number of iterations to run. As part of the |
| kernel parameter, we choose 10 as the dimension of the space where we train |
| SVM. A larger number will lead to a more powerful model but run the risk of |
| overfitting. As a result, the model will be a 10 dimensional vector, instead |
| of 4 as in the case of linear model, which we will verify when we examine the |
| models. |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random; |
| SELECT madlib.svm_classification( 'houses', |
| 'houses_svm_gaussian', |
| 'price < 100000', |
| 'ARRAY[1, tax, bath, size]', |
| 'gaussian', |
| 'n_components=10', |
| '', |
| 'init_stepsize=1, max_iter=200' |
| ); |
| </pre> |
| -# View the results from kernel SVM for classification. |
| <pre class="example"> |
| -- Set extended display on for easier reading of output |
| \\x ON |
| SELECT * FROM houses_svm_gaussian; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]------+-------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {0.183800813574,-0.78724997813,1.54121854068,1.24432527042,4.01230959334,1.07061097224,-4.92576349408,0.437699542875,0.3128600981,-1.63880635658} |
| loss | 0.998735180388 |
| norm_of_gradient | 0.729823950583579 |
| num_iterations | 196 |
| num_rows_processed | 15 |
| num_rows_skipped | 0 |
| dep_var_mapping | {f,t} |
| </pre> |
| -# The regression models have a similar format (model output not shown). First, for a linear model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm_regression, houses_svm_regression_summary; |
| SELECT madlib.svm_regression('houses', |
| 'houses_svm_regression', |
| 'price', |
| 'ARRAY[1, tax, bath, size]' |
| ); |
| </pre> |
| For a non-linear regression model using a Gaussian kernel: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm_gaussian_regression, houses_svm_gaussian_regression_summary, houses_svm_gaussian_regression_random; |
| SELECT madlib.svm_regression( 'houses', |
| 'houses_svm_gaussian_regression', |
| 'price', |
| 'ARRAY[1, tax, bath, size]', |
| 'gaussian', |
| 'n_components=10', |
| '', |
| 'init_stepsize=1, max_iter=200' |
| ); |
| </pre> |
| -# Now train a non-linear one-class SVM for novelty detection, using a Gaussian kernel. |
| Note that the dependent variable is not a parameter for one-class: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_one_class_gaussian, houses_one_class_gaussian_summary, houses_one_class_gaussian_random; |
| select madlib.svm_one_class('houses', |
| 'houses_one_class_gaussian', |
| 'ARRAY[1,tax,bedroom,bath,size,lot,price]', |
| 'gaussian', |
| 'gamma=0.5,n_components=55, random_state=3', |
| NULL, |
| 'max_iter=100, init_stepsize=10,lambda=10, tolerance=0' |
| ); |
| </pre> |
| -# View the result for the Gaussian novelty detection model. |
| <pre class="example"> |
| -- Set extended display on for easier reading of output |
| \\x ON |
| SELECT * FROM houses_one_class_gaussian; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {redacted for brevity} |
| loss | 15.1053343738 |
| norm_of_gradient | 13.9133653663837 |
| num_iterations | 100 |
| num_rows_processed | 16 |
| num_rows_skipped | -1 |
| dep_var_mapping | {-1,1} |
| </pre> |
| -# Now let's look at the prediction functions. We want to predict if house price |
| is less than $100,000. In the following examples we will |
| use the training data set for prediction as well, which is not usual but serves to |
| show the syntax. The predicted results are in the \e prediction column and the |
| actual data is in the \e target column. |
| For the linear model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_pred; |
| SELECT madlib.svm_predict('houses_svm', 'houses', 'id', 'houses_pred'); |
| SELECT *, price < 100000 AS target FROM houses JOIN houses_pred USING (id) ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target |
| ----+------+---------+------+--------+------+-------+------------+--------------------+-------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 104.685894748292 | t |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 200.592436923938 | t |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | t | 378.765847404582 | t |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | t | 214.034895129328 | t |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | t | 153.227581012028 | f |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | f | -102.382793811158 | t |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | f | -53.8237999423388 | f |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | t | 223.090041223192 | f |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | f | -0.858545961972027 | f |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | f | -78.226279884182 | f |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | f | -118.078558954948 | t |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | f | -26.3388234857219 | f |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | f | -279.923699905712 | f |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | f | -50.7810508979155 | f |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | t | 333.579085875975 | t |
| </pre> |
| Prediction using the Gaussian model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_pred_gaussian; |
| SELECT madlib.svm_predict('houses_svm_gaussian', 'houses', 'id', 'houses_pred_gaussian'); |
| SELECT *, price < 100000 AS target FROM houses JOIN houses_pred_gaussian USING (id) ORDER BY id; |
| </pre> |
| This produces a more accurate result than the linear case for this small data set: |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | prediction | decision_function | target |
| ----+------+---------+------+--------+------+-------+------------+-------------------+-------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | t | 1.00338548176312 | t |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | t | 1.00000000098154 | t |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | t | 0.246566699635389 | t |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | t | 1.0000000003367 | t |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | f | -1.98940593324397 | f |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | t | 3.74336995109761 | t |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | f | -1.01574407296086 | f |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | f | -1.0000000002071 | f |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | f | -3.88267069310101 | f |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | f | -3.44507576539002 | f |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | t | 2.3409866081761 | t |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | f | -3.51563221173085 | f |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | f | -1.00000000011163 | f |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | f | -1.87710363254055 | f |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | t | 1.34334834982263 | t |
| </pre> |
| -# Prediction using the linear regression model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_regr; |
| SELECT madlib.svm_predict('houses_svm_regression', 'houses', 'id', 'houses_regr'); |
| SELECT * FROM houses JOIN houses_regr USING (id) ORDER BY id; |
| </pre> |
| Result for the linear regression model: |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | prediction | decision_function |
| ----+------+---------+------+--------+------+-------+------------------+------------------- |
| 1 | 590 | 2 | 1 | 50000 | 770 | 22100 | 55288.6992755623 | 55288.6992755623 |
| 2 | 1050 | 3 | 2 | 85000 | 1410 | 12000 | 99978.8137019119 | 99978.8137019119 |
| 3 | 20 | 3 | 1 | 22500 | 1060 | 3500 | 43157.5130381023 | 43157.5130381023 |
| 4 | 870 | 2 | 2 | 90000 | 1300 | 17500 | 88098.9557296729 | 88098.9557296729 |
| 5 | 1320 | 3 | 2 | 133000 | 1500 | 30000 | 114803.884262468 | 114803.884262468 |
| 6 | 1350 | 2 | 1 | 90500 | 820 | 25700 | 88899.5186193813 | 88899.5186193813 |
| 7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000 | 201108.397013076 | 201108.397013076 |
| 8 | 680 | 2 | 1 | 142500 | 1170 | 22000 | 75004.3236915733 | 75004.3236915733 |
| 9 | 1840 | 3 | 2 | 160000 | 1500 | 19000 | 136434.749667136 | 136434.749667136 |
| 10 | 3680 | 4 | 2 | 240000 | 2790 | 20000 | 264483.856987395 | 264483.856987395 |
| 11 | 1660 | 3 | 1 | 87000 | 1030 | 17500 | 110180.048139857 | 110180.048139857 |
| 12 | 1620 | 3 | 2 | 118600 | 1250 | 20000 | 117300.841695563 | 117300.841695563 |
| 13 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 199229.683967752 | 199229.683967752 |
| 14 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 147998.930271016 | 147998.930271016 |
| 15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | 84936.7661235861 | 84936.7661235861 |
| </pre> |
| For the non-linear Gaussian regression model (output not shown): |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_gaussian_regr; |
| SELECT madlib.svm_predict('houses_svm_gaussian_regression', 'houses', 'id', 'houses_gaussian_regr'); |
| SELECT * FROM houses JOIN houses_gaussian_regr USING (id) ORDER BY id; |
| </pre> |
| -# For the novelty detection using one-class, let's create a test data set using |
| the last 3 values from the training set plus an outlier at the end (10x price): |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_one_class_test; |
| CREATE TABLE houses_one_class_test (id INT, tax INT, bedroom INT, bath FLOAT, price INT, |
| size INT, lot INT); |
| COPY houses_one_class_test FROM STDIN WITH DELIMITER '|'; |
| 1 | 3100 | 3 | 2 | 140000 | 1760 | 38000 |
| 2 | 2070 | 2 | 3 | 148000 | 1550 | 14000 |
| 3 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 |
| 4 | 650 | 3 | 1.5 | 650000 | 1450 | 12000 |
| \\. |
| </pre> |
| Now run prediction on the Gaussian one-class novelty detection model: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_one_class_pred; |
| SELECT madlib.svm_predict('houses_one_class_gaussian', 'houses_one_class_test', 'id', 'houses_one_class_pred'); |
| SELECT * FROM houses_one_class_test JOIN houses_one_class_pred USING (id) ORDER BY id; |
| </pre> |
| Result showing the last row predicted to be novel: |
| <pre class="result"> |
| id | tax | bedroom | bath | price | size | lot | prediction | decision_function |
| ----+------+---------+------+--------+------+-------+------------+--------------------- |
| 1 | 3100 | 3 | 2 | 140000 | 1760 | 38000 | 1 | 0.111497008121437 |
| 2 | 2070 | 2 | 3 | 148000 | 1550 | 14000 | 1 | 0.0996021345169148 |
| 3 | 650 | 3 | 1.5 | 65000 | 1450 | 12000 | 1 | 0.0435064008756942 |
| 4 | 650 | 3 | 1.5 | 650000 | 1450 | 12000 | -1 | -0.0168967845338403 |
| </pre> |
| -# Create a model for an unbalanced class-size dataset, then use the 'balanced' parameter |
| to classify: |
| <pre class="example"> |
| DROP TABLE IF EXISTS houses_svm_gaussian, houses_svm_gaussian_summary, houses_svm_gaussian_random; |
| SELECT madlib.svm_classification( 'houses', |
| 'houses_svm_gaussian', |
| 'price < 150000', |
| 'ARRAY[1, tax, bath, size]', |
| 'gaussian', |
| 'n_components=10', |
| '', |
| 'init_stepsize=1, max_iter=200, class_weight=balanced' |
| ); |
| SELECT * FROM houses_svm_gaussian; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------- |
| coef | {-0.621843913637,2.4166374426,-1.54726833725,-1.74512599505,1.16231799548,-0.54019307285,-4.14373293694,-0.623069170717,3.59669949057,-1.005501237} |
| loss | 1.87657250199 |
| norm_of_gradient | 1.41148000266816 |
| num_iterations | 174 |
| num_rows_processed | 15 |
| num_rows_skipped | 0 |
| dep_var_mapping | {f,t} |
| </pre> |
| Note that the results you get for all examples may vary with the platform you are using. |
| |
| @anchor background |
| @par Technical Background |
| |
| To solve linear SVM, the following objective function is minimized: |
\f[
\underset{w,b}{\text{Minimize }} \lambda||w||^2 + \frac{1}{n}\sum_{i=1}^n \ell(y_i,f_{w,b}(x_i))
\f]
| |
| where \f$(x_1,y_1),\ldots,(x_n,y_n)\f$ are labeled training data and |
| \f$\ell(y,f(x))\f$ is a loss function. When performing classification, |
| \f$\ell(y,f(x)) = \max(0,1-yf(x))\f$ is the <em>hinge loss</em>. |
| For regression, the loss function \f$\ell(y,f(x)) = \max(0,|y-f(x)|-\epsilon)\f$ |
| is used. |
| |
| If \f$ f_{w,b}(x) = \langle w, x\rangle + b\f$ is linear, then the |
| objective function is convex and incremental gradient descent (IGD, or SGD) |
| can be applied to find a global minimum. See Feng, et al. [1] for more details. |
| |
| To learn with Gaussian or polynomial kernels, the training data is first mapped |
| via a <em>random feature map</em> in such a way that the usual inner product in |
| the feature space approximates the kernel function in the input space. The |
| linear SVM training function is then run on the resulting data. See the papers |
| [2,3] for more information on random feature maps. |
| |
| Also, see the book [4] by Scholkopf and Smola for more details on SVMs in general. |
| |
| @anchor literature |
| @literature |
| |
| @anchor svm-lit-1 |
| [1] Xixuan Feng, Arun Kumar, Ben Recht, and Christopher Re: |
| Towards a Unified Architecture for in-RDBMS analytics, |
| in SIGMOD Conference, 2012 |
| http://www.eecs.berkeley.edu/~brecht/papers/12.FengEtAl.SIGMOD.pdf |
| |
| @anchor svm-lit-2 |
| [2] Purushottam Kar and Harish Karnick: Random Feature Maps for Dot |
| Product Kernels, Proceedings of the 15th International Conference |
| on Artificial Intelligence and Statistics, 2012, |
| http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2012_KarK12.pdf |
| |
| @anchor svm-lit-3 |
[3] Ali Rahimi and Benjamin Recht: Random Features for Large-Scale
| Kernel Machines, Neural Information Processing Systems 2007, |
| http://www.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf |
| |
| @anchor svm-lit-4 |
| [4] Bernhard Scholkopf and Alexander Smola: Learning with Kernels, |
| The MIT Press, Cambridge, MA, 2002. |
| |
| @anchor svm-lit-5 |
| [5] Vladimir Cherkassky and Yunqian Ma: Practical Selection of SVM Parameters |
| and Noise Estimation for SVM Regression, Neural Networks, 2004 |
| http://www.ece.umn.edu/users/cherkass/N2002-SI-SVM-13-whole.pdf |
| |
| @anchor related |
| @par Related Topics |
| |
| File svm.sql_in documenting the training function |
| |
| @internal |
| @sa Namespace SVM (documenting the driver/outer loop implemented in |
| Python), Namespace |
| \ref madlib::modules::regress documenting the implementation in C++ |
| @endinternal |
| */ |
| |
| |
-- Composite type returned by internal_linear_svm_igd_result() below,
-- holding the model extracted from a final IGD aggregate state.
DROP TYPE IF EXISTS MADLIB_SCHEMA.linear_svm_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.linear_svm_result AS (
    coefficients double precision[],     -- fitted coefficients of the separating hyperplane
    loss double precision,               -- objective (loss) value at the final state
    norm_of_gradient double precision,   -- gradient norm, reported as a convergence measure
    num_rows_processed bigint            -- number of rows consumed during training
);
| |
| -------------------------------------------------------------------------- |
| -- create SQL functions for IGD optimizer |
| -------------------------------------------------------------------------- |
| -- cannot be labeled as STRICT because we set previous_state NULL initially |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_transition( |
| state double precision[], |
| ind_var double precision[], |
| dep_var double precision, |
| previous_state double precision[], |
| dimension integer, |
| stepsize double precision, |
| reg double precision, |
| is_l2 boolean, |
| n_tuples integer, |
| epsilon double precision, |
| is_svc boolean, |
| tuple_weight double precision |
| ) |
| RETURNS double precision[] AS 'MODULE_PATHNAME' |
| LANGUAGE C IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL'); |
| |
-- Merge two partial IGD states computed on different data partitions
-- (used as the aggregate prefunc on distributed deployments).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_merge(
    state1 double precision[],
    state2 double precision[])
RETURNS double precision[] AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
| |
-- Final step of the IGD aggregate: post-processes the accumulated state
-- into the per-iteration result consumed by the Python driver.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.linear_svm_igd_final(
    state double precision[])
RETURNS double precision[] AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
| |
| /** |
| * @internal |
| * @brief Perform one iteration of the incremental gradient |
| * method for computing linear support vector machine |
| */ |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.linear_svm_igd_step( |
| /*+ ind_var */ double precision[], |
| /*+ dep_var */ double precision, |
| /*+ previous_state */ double precision[], |
| /*+ dimension */ integer, |
| /*+ stepsize */ double precision, |
| /*+ reg */ double precision, |
| /*+ is_l2 */ boolean, |
| /*+ n_tuples */ integer, |
| /*+ epsilon */ double precision, |
| /*+ is_svc */ boolean, |
| /*+ tuple_weight */ double precision |
| ); |
| CREATE AGGREGATE MADLIB_SCHEMA.linear_svm_igd_step( |
| /*+ ind_var */ double precision[], |
| /*+ dep_var */ double precision, |
| /*+ previous_state */ double precision[], |
| /*+ dimension */ integer, |
| /*+ stepsize */ double precision, |
| /*+ reg */ double precision, |
| /*+ is_l2 */ boolean, |
| /*+ n_tuples */ integer, |
| /*+ epsilon */ double precision, |
| /*+ is_svc */ boolean, |
| /*+ tuple_weight */ double precision |
| ) ( |
| STYPE=double precision[], |
| SFUNC=MADLIB_SCHEMA.linear_svm_igd_transition, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.linear_svm_igd_merge,') |
| FINALFUNC=MADLIB_SCHEMA.linear_svm_igd_final, |
| INITCOND='{0,0,0,0,0,0,0}' |
| ); |
| |
-- Distance between two IGD states; used by the outer (Python) loop to
-- decide convergence between successive iterations.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_distance(
    /*+ state1 */ double precision[],
    /*+ state2 */ double precision[])
RETURNS double precision AS 'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
| |
-- Unpack a final IGD state into the linear_svm_result composite type
-- (coefficients, loss, gradient norm, rows processed).
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_linear_svm_igd_result(
    /*+ state */ double precision[])
RETURNS MADLIB_SCHEMA.linear_svm_result AS 'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL');
| |
| |
-- Main entry point for SVM regression training. All argument validation
-- and default handling is implemented in the Python driver (svm.svm);
-- this wrapper only records that we are NOT training a classifier
-- (is_svc = False) before dispatching.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text,
    params text,
    verbose bool)
RETURNS void AS $$
# indent according to PythonFunction
global is_svc
is_svc = False
PythonFunction(svm, svm, svm)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

-- The overloads below form a defaulting cascade: each one supplies NULL
-- for its missing trailing argument and delegates to the next-larger
-- signature, terminating at the 9-argument version above.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text,
    params text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, $8, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, $7, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, $6, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, $5, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_regression($1, $2, $3, $4, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
| ----------------- |
| |
-- Main entry point for SVM classification training. All argument
-- validation and default handling is implemented in the Python driver
-- (svm.svm); this wrapper only records that we are training a classifier
-- (is_svc = True) before dispatching.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text,
    params text,
    verbose bool)
RETURNS void AS $$
# indent according to PythonFunction
global is_svc
is_svc = True
PythonFunction(svm, svm, svm)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

-- all default value handling implemented in Python
-- The overloads below form a defaulting cascade: each supplies NULL for
-- its missing trailing argument and delegates to the next-larger
-- signature, terminating at the 9-argument version above.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text,
    params text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, $8, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, $7, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text,
    kernel_params text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, $6, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text,
    kernel_func text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, $5, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification(
    source_table text,
    model_table text,
    dependent_varname text,
    independent_varname text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_classification($1, $2, $3, $4, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
| ----------------------------------------------------------------- |
| |
| |
-- Main entry point for one-class SVM (novelty detection). Unlike the
-- regression/classification entry points there is no dependent variable;
-- the Python driver (svm.svm_one_class) handles everything.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
    source_table text,
    model_table text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text,
    params text,
    verbose bool)
RETURNS void AS $$
PythonFunction(svm, svm, svm_one_class)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

-- all default value handling implemented in Python
-- NOTE(review): this cascade fills omitted arguments with ''::text and
-- verbose=FALSE, whereas the svm_regression/svm_classification cascades
-- pass NULL. Presumably the Python driver treats empty string and NULL
-- the same way -- confirm before unifying.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
    source_table text,
    model_table text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text,
    params text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, $7, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
    source_table text,
    model_table text,
    independent_varname text,
    kernel_func text,
    kernel_params text,
    grouping_col text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, $6, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
    source_table text,
    model_table text,
    independent_varname text,
    kernel_func text,
    kernel_params text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, $5, ''::text, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
    source_table text,
    model_table text,
    independent_varname text,
    kernel_func text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, $4, ''::text, ''::text, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class(
    source_table text,
    model_table text,
    independent_varname text)
RETURNS void AS $$
    SELECT MADLIB_SCHEMA.svm_one_class($1, $2, $3, ''::text,
        ''::text, ''::text, ''::text, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
| |
| |
| |
| ------ Prediction ------------------------------------------------------------- |
| /** |
| * @brief Scores the data points stored in a table using a learned linear support-vector model |
| * @param model_table Name of table where the learned model to be used is stored |
| * @param new_data_table Name of table/view containing the data points to be scored |
| * @param id_col Name of column in new_data_table containing the integer identifier of data points |
| * |
| * |
| * |
| * @param output_table Name of table to store the results |
| * |
| * @return Textual summary of the algorithm run |
| * |
| * @internal |
| * @sa This function is a wrapper for svm.svm_predict(). |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict( |
| model_table text, |
| new_data_table text, |
| id_col_name text, |
| output_table text) |
| RETURNS void AS $$ |
| PythonFunction(svm, svm, svm_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); |
| |
-- Online Help -----------------------------------------------------------

/**
 * @brief Help function: returns usage information for svm_predict()
 */

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict(
    message TEXT
) RETURNS TEXT AS $$
PythonFunction(svm, svm, svm_predict_help)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument variant: delegates with NULL to produce the default
-- help/summary text.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_predict()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.svm_predict(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
-- NOTE(review): a second, byte-identical definition of
-- svm_predict(message text) was removed here. The same function is
-- already created earlier in this file (the svm_predict(message TEXT)
-- help function), so this CREATE OR REPLACE was a redundant duplicate
-- that silently re-created the identical function.
| |
-- Help function for svm_classification: dispatches to the shared Python
-- help routine with is_svc=True.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification (
    message TEXT
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(svm, svm)
    return svm.svm_help(schema_madlib, message, True)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument variant: delegates with NULL to produce the default
-- help/summary text.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_classification ()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.svm_classification(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
-- Help function for svm_one_class: dispatches to the dedicated Python
-- help routine for novelty detection.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class (
    message TEXT
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(svm, svm)
    return svm.svm_one_class_help(schema_madlib, message, True)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');

-- Zero-argument variant: delegates with NULL to produce the default
-- help/summary text.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_one_class ()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.svm_one_class(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
| |
-- Help function for svm_regression: dispatches to the shared Python help
-- routine with is_svc=False.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression (
    message TEXT
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(svm, svm)
    return svm.svm_help(schema_madlib, message, False)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
| |
-- Zero-argument help variant: delegates with NULL for consistency with
-- the svm_predict(), svm_classification() and svm_one_class() wrappers,
-- which all pass NULL::TEXT rather than an empty string.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svm_regression ()
RETURNS TEXT AS $$
    SELECT MADLIB_SCHEMA.svm_regression(NULL::TEXT)
$$ LANGUAGE SQL IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');