| from __future__ import division, print_function |
| |
| import plpy |
| |
| from collections import defaultdict |
| |
| from kernel_approximation import create_kernel, load_kernel |
| |
| from utilities.control import MinWarning |
| from utilities.in_mem_group_control import GroupIterationController |
| from utilities.utilities import _assert |
| from utilities.utilities import _string_to_array |
| from utilities.utilities import _string_to_array_with_quotes |
| from utilities.utilities import add_postfix |
| from utilities.utilities import extract_keyvalue_params |
| from utilities.utilities import get_grouping_col_str |
| from utilities.utilities import num_features, num_samples |
| from utilities.utilities import preprocess_keyvalue_params |
| from utilities.utilities import unique_string |
| |
| from utilities.validate_args import cols_in_tbl_valid |
| from utilities.validate_args import explicit_bool_to_text |
| from utilities.validate_args import get_expr_type |
| from utilities.validate_args import input_tbl_valid |
| from utilities.validate_args import is_var_valid |
| from utilities.validate_args import output_tbl_valid |
| |
| from validation.internal.cross_validation import CrossValidator |
| |
| |
def _compute_svm(args):
    """
    Drive the grouped IGD iterations that fit the linear SVM coefficients.

    The step size starts at args['init_stepsize']. After every iteration it
    is multiplied by decay_factor when that factor is positive; otherwise it
    is reset to init_stepsize / (iteration + 1).

    @param args Dictionary of training arguments. It receives a 'stepsize'
                entry here and is handed to GroupIterationController.

    @return Number of iterations that has been run
    """
    initial_step = args['init_stepsize']
    args['stepsize'] = initial_step
    controller = GroupIterationController(args)
    with controller as it:
        it.iteration = 0
        while True:
            # One pass of the per-group IGD transition over the data.
            it.update(
                """
                {schema_madlib}.linear_svm_igd_step(
                    ({col_ind_var})::FLOAT8[],
                    ({col_dep_var_trans})::FLOAT8,
                    {rel_state}.{col_grp_state},
                    {n_features}::INT4,
                    {stepsize}::FLOAT8,
                    {lambda}::FLOAT8,
                    {is_l2}::BOOLEAN,
                    {col_n_tuples},
                    ({select_epsilon})::FLOAT8,
                    {is_svc}::BOOLEAN,
                    {class_weight_sql}::FLOAT8
                    )
                """)
            it.info()

            # Re-schedule the step size for the next pass:
            # positive decay_factor -> exponential decay,
            # anything else         -> inverse scaling.
            decay = it.kwargs['decay_factor']
            if decay > 0:
                it.kwargs['stepsize'] *= decay
            else:
                it.kwargs['stepsize'] = initial_step / (it.iteration + 1)

            # Stop on the iteration cap or when the state barely moved.
            converged = it.test(
                """
                {iteration} >= {max_iter}
                OR {schema_madlib}.internal_linear_svm_igd_distance(
                    _state_previous, _state_current) < {tolerance}
                """)
            if converged:
                break
        it.final()
    return controller.iteration
| # ------------------------------------------------------------------------------ |
| |
| |
def _verify_table(source_table, model_table, dependent_varname,
                  independent_varname, verify_dep=True, **kwargs):
    """
    Validate the input table, the variable expressions and the output names.

    @param source_table Name of the training table
    @param model_table Name of the model table to be created
    @param dependent_varname Dependent variable expression
    @param independent_varname Independent variable expression
    @param verify_dep When False the dependent variable checks are skipped
                      (used when there is no dependent variable to check)

    Errors are reported through the validation helpers / plpy.error.
    """
    invalid_var_msg = ("SVM error: invalid {role} ('{expr}') "
                       "for source_table ({tbl})!")

    # validate input
    input_tbl_valid(source_table, 'SVM')
    _assert(is_var_valid(source_table, independent_varname),
            invalid_var_msg.format(role='independent_varname',
                                   expr=independent_varname,
                                   tbl=source_table))

    if verify_dep:
        _assert(is_var_valid(source_table, dependent_varname),
                invalid_var_msg.format(role='dependent_varname',
                                       expr=dependent_varname,
                                       tbl=source_table))
        # an array-typed dependent variable is rejected
        if '[]' in get_expr_type(dependent_varname, source_table):
            plpy.error("SVM error: dependent_varname cannot be of array type!")

    # validate output tables: the model table and its "_summary" companion
    for output_name in (model_table, add_postfix(model_table, "_summary")):
        output_tbl_valid(output_name, 'SVM')
| # ------------------------------------------------------------------------------ |
| |
| |
def _verify_get_params_dict(params_dict):
    """
    Check that the tunable parameters are no longer sized containers.

    Each of lambda, epsilon, init_stepsize, decay_factor and max_iter must
    not expose '__len__' once cross validation has picked a value.

    @param params_dict Dictionary holding the parameters listed above

    @return The same params_dict, unchanged
    """
    scalar_only = ('lambda', 'epsilon', 'init_stepsize',
                   'decay_factor', 'max_iter')
    for name in scalar_only:
        _assert(not hasattr(params_dict[name], '__len__'),
                "SVM Error: {0} should not be a list "
                "after cross validation!".format(name))
    return params_dict
| # ------------------------------------------------------------------------------ |
| |
| |
def _build_output_tables(n_iters_run, args, **kwargs):
    """
    Create the model table and the '<model_table>_summary' table.

    The names recorded in the summary (source table, dependent and
    independent variables) come from the transformer's original table when
    args['use_transformer_for_output'] is set, and from args otherwise.
    The transformer is also saved under '<model_table>_random'.

    @param n_iters_run Iteration number whose state row is read from the
                       state relation and reported as num_iterations
    @param args Dictionary of training arguments. NOTE: it is updated in
                place with every local variable of this function.
    """

    transformer = args['transformer']
    use_transformer_for_output = args['use_transformer_for_output']
    if use_transformer_for_output:
        # transformer should always be a valid object created using the transform function.
        ot = transformer.original_table
        independent_varname = ot['independent_varname']
        dependent_varname = ot['dependent_varname']
        source_table = ot['source_table']
        if not dependent_varname:
            # an exception added for the svm_one_class where dependent_varname
            # is artificially injected into the transformed table and does not
            # exist in the original table. Hence we use transformed table
            # to get the expression type
            tt = transformer.transformed_table
            dep_type = get_expr_type(tt['dependent_varname'], tt['source_table'])
        else:
            dep_type = get_expr_type(dependent_varname, source_table)
    else:
        source_table = args['source_table']
        independent_varname = args['independent_varname']
        dependent_varname = args['dependent_varname']
        dep_type = get_expr_type(dependent_varname, source_table)

    model_table = args['model_table']
    random_table = add_postfix(model_table, "_random")
    # presumably persists the kernel's random-feature data for prediction;
    # save_as is defined elsewhere -- TODO confirm
    transformer.save_as(random_table)
    kernel_func = transformer.kernel_func
    kernel_params = transformer.kernel_params

    grouping_col = args['grouping_col']
    col_grp_key = args['col_grp_key']
    if grouping_col:
        groupby_str = "GROUP BY {0}, {1}".format(grouping_col, col_grp_key)
        grouping_str1 = grouping_col + ","
        using_str = "USING ({col_grp_key})".format(col_grp_key=col_grp_key)
    else:
        # without grouping both sub-queries are joined unconditionally
        groupby_str, grouping_str1, using_str = "", "", "ON TRUE"
    # organizing results
    # Every local defined above (n_iters_run, dep_type, groupby_str,
    # grouping_str1, using_str, ...) becomes a key of args so that the SQL
    # templates below can reference it. Local names are therefore part of the
    # template contract; renaming one silently breaks the .format() calls.
    # Placeholders not assigned in this function ({mapping}, {grouping_str},
    # {rel_state}, {col_grp_state}, {col_grp_iteration}, {schema_madlib}, ...)
    # are expected to be present in args already.
    args.update(locals())
    model_table_query = """
        CREATE TABLE {model_table} AS
            SELECT
                {grouping_str1}
                (result).coefficients           AS coef,
                (result).loss                   AS loss,
                (result).norm_of_gradient       AS norm_of_gradient,
                {n_iters_run}                   AS num_iterations,
                (result).num_rows_processed     AS num_rows_processed,
                n_tuples_including_nulls - (result).num_rows_processed
                                                AS num_rows_skipped,
                ARRAY[{mapping}]::{dep_type}[]  AS dep_var_mapping
            FROM
            (
                SELECT
                    {schema_madlib}.internal_linear_svm_igd_result(
                        {col_grp_state}
                    ) AS result,
                    {col_grp_key}
                FROM {rel_state}
                WHERE {col_grp_iteration} = {n_iters_run}
            ) rel_state_subq
            JOIN
            (
                SELECT
                    {grouping_str1}
                    count(*) AS n_tuples_including_nulls,
                    array_to_string(ARRAY[{grouping_str}],
                                    ','
                                   ) AS {col_grp_key}
                FROM {source_table}
                {groupby_str}
            ) n_tuples_including_nulls_subq
            {using_str}
        """.format(**args)
    plpy.execute(model_table_query)

    # summary table
    # a group counts as failed when its coefficient vector is NULL
    n_failed_groups = plpy.execute("""
            SELECT count(*) AS num_failed_groups
            FROM {0}
            WHERE coef IS NULL
            """.format(model_table))[0]['num_failed_groups']
    summary_table = add_postfix(model_table, "_summary")
    grouping_text = "NULL" if not grouping_col else grouping_col
    # NOTE(review): the values below are interpolated directly into
    # single-quoted SQL literals; a value containing a single quote (e.g. a
    # dependent_varname expression with a string constant) would presumably
    # break this statement -- confirm how callers quote such input.
    plpy.execute("""
            CREATE TABLE {summary_table} AS
            SELECT
                '{method}'::text                AS method,
                '__MADLIB_VERSION__'::text      AS version_number,
                '{source_table}'::text          AS source_table,
                '{model_table}'::text           AS model_table,
                '{dependent_varname}'::text     AS dependent_varname,
                '{independent_varname}'::text   AS independent_varname,
                '{kernel_func}'::text           AS kernel_func,
                '{kernel_params}'::text         AS kernel_params,
                '{grouping_text}'::text         AS grouping_col,
                $$ init_stepsize={init_stepsize},
                   decay_factor={decay_factor},
                   max_iter={max_iter},
                   tolerance={tolerance},
                   epsilon={epsilon},
                   eps_table={eps_table},
                   class_weight={class_weight}
                $$::text                        AS optim_params,
                'lambda={lambda}, norm={norm}, n_folds={n_folds}'::text
                                                AS reg_params,
                count(*)::integer               AS num_all_groups,
                {n_failed_groups}::integer      AS num_failed_groups,
                sum(num_rows_processed)::bigint AS total_rows_processed,
                sum(num_rows_skipped)::bigint   AS total_rows_skipped
            FROM {model_table};
            """.format(summary_table=summary_table,
                       grouping_text=grouping_text,
                       n_failed_groups=n_failed_groups,
                       **args))
| # ------------------------------------------------------------------------------ |
| |
| |
def svm_predict_help(schema_madlib, message, **kwargs):
    """
    Return the help text of svm_predict for the requested topic.

    @param schema_madlib Name of the MADlib schema, substituted into the text
    @param message Topic selector: empty/None for the summary; 'usage',
                   'help' or '?' for the syntax; 'example' for a worked
                   example (matching is case-insensitive)

    @return Help text as a string; an unknown topic yields a short hint
    """
    def fill(template):
        # every template only needs the schema name
        return template.format(schema_madlib=schema_madlib)

    summary_text = fill("""
    ----------------------------------------------------------------
                            SUMMARY
    ----------------------------------------------------------------
    Prediction for SVM can be used to obtain a prediction of both the
    boolean and continuous value of the dependent variable given a
    value of independent variable.

    For more details on function usage:
        SELECT {schema_madlib}.svm_predict('usage')

    For a small example on using the function:
        SELECT {schema_madlib}.svm_predict('example')
    """)

    usage_text = fill("""
    ---------------------------------------------------------------------------
                                    PREDICTION
    ---------------------------------------------------------------------------
    The prediction function is used to estimate the conditional mean given a
    new predictor. It has the following syntax:

    SELECT {schema_madlib}.svm_predict(
        model_table,        -- TEXT. Model table produced by
                               the training function.
        new_data_table,     -- TEXT. Name of the table containing the
                               prediction data. This table is expected to
                               contain the same features that were used during
                               training. The table should also contain
                               id_col_name used for identifying each row.
        id_col_name,        -- TEXT. The name of the id column in
                               the input table.
        output_table        -- TEXT. Name of the table where output
                               predictions are written. If this table name is
                               already in use, then an error is returned. The
                               table contains the id_col_name column giving
                               the 'id' for each prediction and the prediction
                               columns for the dependent variable.
    );
    """)

    example_text = fill("""
    ---------------------------------------------------------------------------
                                    EXAMPLES
    ---------------------------------------------------------------------------
    - Create an input data set.

    CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
                size INT, lot INT);
    COPY houses FROM STDIN WITH DELIMITER '|';
      1 | 590 | 2 | 1 | 50000 | 770 | 22100
      2 | 1050 | 3 | 2 | 85000 | 1410 | 12000
      3 | 20 | 3 | 1 | 22500 | 1060 | 3500
      4 | 870 | 2 | 2 | 90000 | 1300 | 17500
      5 | 1320 | 3 | 2 | 133000 | 1500 | 30000
      6 | 1350 | 2 | 1 | 90500 | 820 | 25700
      7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
      8 | 680 | 2 | 1 | 142500 | 1170 | 22000
      9 | 1840 | 3 | 2 | 160000 | 1500 | 19000
     10 | 3680 | 4 | 2 | 240000 | 2790 | 20000
     11 | 1660 | 3 | 1 | 87000 | 1030 | 17500
     12 | 1620 | 3 | 2 | 118600 | 1250 | 20000
     13 | 3100 | 3 | 2 | 140000 | 1760 | 38000
     14 | 2070 | 2 | 3 | 148000 | 1550 | 14000
     15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000
    \\.

    - Train a classification model, using a linear model.

    SELECT {schema_madlib}.svm_classification('houses',
                                 'houses_svm',
                                 'price < 100000',
                                 'ARRAY[1, tax, bath, size]');

    - Generate a nonlinear model using a Gaussian kernel. This time we
    specify the initial step size and maximum number of iterations to run.
    As part of the kernel parameter, we choose 10 as the dimension of the
    space where we train SVM. A larger number will lead to a more powerful
    model but run the risk of overfitting. As a result, the model will be a
    10 dimensional vector, instead of 4 as in the case of linear model.

    SELECT {schema_madlib}.svm_classification( 'houses',
                                  'houses_svm_gaussian',
                                  'price < 100000',
                                  'ARRAY[1, tax, bath, size]',
                                  'gaussian',
                                  'n_components=10',
                                  '',
                                  'init_stepsize=1, max_iter=200');

    - Use the prediction function to evaluate the models. The predicted
    results are in the prediction column and the actual data is in the
    target column.

    -- For the linear model:
    SELECT {schema_madlib}.svm_predict('houses_svm',
                                       'houses',
                                       'id',
                                       'houses_pred');
    SELECT *, price < 100000 AS target
    FROM houses JOIN houses_pred
    USING (id) ORDER BY id;

    -- For the Gaussian model:
    SELECT {schema_madlib}.svm_predict('houses_svm_gaussian',
                                       'houses',
                                       'id',
                                       'houses_pred_gaussian');
    SELECT *, price < 100000 AS target
    FROM houses JOIN houses_pred_gaussian
    USING (id) ORDER BY id;
    """)

    unknown_text = fill("""
    No such option. Use "SELECT {schema_madlib}.svm_predict()" for help.
    """)

    if not message:
        return summary_text

    # several spellings map onto the usage text
    topics = {
        'usage': usage_text,
        'help': usage_text,
        '?': usage_text,
        'example': example_text,
    }
    return topics.get(message.lower(), unknown_text)
| # ------------------------------------------------------------------------------ |
| |
| |
def svm_one_class(schema_madlib, source_table, model_table, independent_varname,
                  kernel_func, kernel_params, grouping_col, params,
                  verbose, **kwargs):
    """ Execute the support vector one-class classification algorithm.

    The data in 'source_table' only contains independent variables. The algorithm
    works by learning a classifier between these independent features
    and the origin. The given data is treated as positive data and the origin
    is treated as negative, with higher weight given to the origin to ensure
    a balanced learning update.

    @param schema_madlib Name of the MADlib schema
    @param source_table Name of the table holding the training data
    @param model_table Name of the model table to create
    @param independent_varname Independent variable expression
    @param kernel_func Kernel name; 'gaussian' is used when empty/None
    @param kernel_params Kernel parameters as a key-value string
    @param grouping_col Grouping columns, or empty/None
    @param params Optimization/regularization parameters as a key-value string
    @param verbose When true the message level is lowered to "info"

    NOTE: the argument dictionary passed downstream is built with locals(),
    so the local variable names in this function are part of the contract
    with _cross_validate_svm and _svm_parsed_params. Do not rename them or
    add new locals before the 'args = locals()' line.
    """
    is_svc = True
    dependent_varname = None
    verbosity_level = "info" if verbose else "error"
    with MinWarning(verbosity_level):
        _verify_table(source_table, model_table,
                      dependent_varname, independent_varname, verify_dep=False)
        # Column names produced in the model table; they may not be used as
        # grouping columns. Each name needs its own comma: adjacent string
        # literals are silently concatenated ('loss' 'num_rows_processed'
        # used to become the single bogus entry 'lossnum_rows_processed').
        reserved_cols = ['coef', 'random_feature_data', 'loss',
                         'num_rows_processed', 'num_rows_skipped',
                         'norm_of_gradient', 'num_iterations']
        grouping_str, grouping_col = get_grouping_col_str(schema_madlib, 'SVM',
                                                          reserved_cols,
                                                          source_table,
                                                          grouping_col)
        if not kernel_func:
            kernel_func = 'gaussian'
        else:
            kernel_func = _get_kernel_name(kernel_func)
        # _transform_w_kernel should always return a transformer. Since
        # override_fit_intercept=True, it should always create a transformed_table
        # containing a intercept along with any kernel transformation in the
        # independent variable array
        transformer = _transform_w_kernel(schema_madlib, source_table,
                                          dependent_varname, independent_varname,
                                          kernel_func, kernel_params,
                                          grouping_col, override_fit_intercept=True)

        # train on the transformed table from here on
        source_table = transformer.transformed_table['source_table']
        independent_varname = transformer.transformed_table['independent_varname']
        dependent_varname = transformer.transformed_table['dependent_varname']
        update_source_for_one_class = True
        args = locals()

        args.update(_extract_params(schema_madlib, params))
        if not args['class_weight']:
            args['class_weight'] = 'balanced'
        _cross_validate_svm(args)
        _svm_parsed_params(use_transformer_for_output=True, **args)
        transformer.clear()
| # ------------------------------------------------------------------------------ |
| |
| |
def get_svc_params_usage_string():
    """
    Return the help text describing the 'params' argument of the SVM functions.

    The decay_factor description mirrors the schedule implemented in
    _compute_svm: a positive factor gives exponential decay, any other value
    gives inverse scaling.

    @return Help text as a string
    """
    return """
    ---------------------------------------------------------------------------
                                    OTHER PARAMETERS
    ---------------------------------------------------------------------------
    Parameters are supplied in params argument as a string
    containing a comma-delimited list of name-value pairs.

    Hyperparameter optimization can be carried out through
    the built-in cross validation mechanism

    init_stepsize     -- Default: [0.01]. Also known as the initial learning rate.
    decay_factor      -- Default: [0.9].
                         Control the learning rate schedule:
                         > 0 means exponential decay, i.e.,
                             stepsize = init_stepsize * decay_factor^iteration
                             (1 therefore keeps the rate constant);
                         0 or a negative value (e.g. -1) means inverse
                         scaling, i.e.,
                             stepsize = init_stepsize / iteration.
    max_iter          -- Default: [100].
                         The maximum number of iterations allowed.
    tolerance         -- Default: 1e-10. The criteria to end iterations.
    lambda            -- Default: [0.01]. Regularization parameter, positive.
    norm              -- Default: 'L2'.
                         Name of the regularization, either 'L2' or 'L1'.
    epsilon           -- Default: [0.01].
                         Determines the $\\epsilon$ for $\\epsilon$-regression.
                         Ignored during classification.
    eps_table         -- Default: NULL.
                         Name of the table that contains values of epsilon for
                         different groups. Ignored when grouping_col is NULL.
    validation_result -- Default: NULL.
                         Name of the table to store the cross validation results
                         including the values of parameters and
                         their averaged error values.
    n_folds           -- Default: 0. Number of folds.
                         Must be at least 2 to activate cross validation.
    """
| # ------------------------------------------------------------------------------ |
| |
| |
def get_svc_gaussian_usage_string():
    """
    Return the help text describing kernel_params for the Gaussian kernel.

    @return Help text as a string
    """
    return """
    ---------------------------------------------------------------------------
                                GAUSSIAN PARAMETERS
    ---------------------------------------------------------------------------
    Parameters are supplied in kernel_params argument as a string
    containing a comma-delimited list of name-value pairs.

    gamma           -- Default: 1/num_features.
                       The parameter $\\gamma$ in the Radial Basis
                       Function kernel.
    n_components    -- Default: 2*num_features.
                       The dimensionality of the transformed feature space.
    random_state    -- Default: 1. Seed used by the random number generator.
    """
| # ------------------------------------------------------------------------------ |
| |
| |
def get_svc_poly_usage_string():
    """
    Return the help text describing kernel_params for the polynomial kernel.

    @return Help text as a string
    """
    polynomial_help = """
    ---------------------------------------------------------------------------
                                POLYNOMIAL PARAMETERS
    ---------------------------------------------------------------------------
    Parameters are supplied in kernel_params argument as a string
    containing a comma-delimited list of name-value pairs.

    coef0           -- Default: 1.0.
                       The independent term q in (xTy + q)^r.
                       Must be larger or equal to 0. When it is 0,
                       the polynomial kernel is in homogeneous form.
    degree          -- Default: 3.
                       The parameter r in (xTy + q)^r.
    n_components    -- Default: 2*num_features.
                       The dimensionality of the transformed feature space.
                       A larger value lowers the variance of the estimate of
                       kernel but requires more memory and
                       takes longer to train.
    random_state    -- Default: 1. Seed used by the random number generator.
    """
    return polynomial_help
| |
| |
def svm_one_class_help(schema_madlib, message, is_svc, **kwargs):
    """
    Return the help text of svm_one_class for the requested topic.

    @param schema_madlib Name of the MADlib schema, substituted into the text
    @param message Topic selector: empty/None for the summary; 'usage',
                   'help' or '?' for the syntax and output description;
                   'params', 'gaussian' or 'polynomial' for the respective
                   parameter lists (matching is case-insensitive)
    @param is_svc Unused; kept so the signature parallels svm_help

    @return Help text as a string; an unknown topic yields a short hint
    """
    method = 'svm_one_class'
    args = dict(schema_madlib=schema_madlib, method=method)

    summary = """
    ----------------------------------------------------------------
                            SUMMARY
    ----------------------------------------------------------------
    Support Vector Machines (SVMs) are models for regression
    and classification tasks.

    SVM models have two particularly desirable features:
    robustness in the presence of noisy data and applicability
    to a variety of data configurations.

    For more details on function usage:
        SELECT {schema_madlib}.{method}('usage')
    """.format(**args)

    # The text below documents what the code in this module actually does:
    # the one-class default kernel is 'gaussian', and the column names/types
    # match the CREATE TABLE statements in _build_output_tables.
    usage = """
    ---------------------------------------------------------------------------
                                        USAGE
    ---------------------------------------------------------------------------
    SELECT {schema_madlib}.{method}(
        source_table,            -- name of input table
        model_table,             -- name of output model table
        independent_varname,     -- names of independent variables
        kernel_func,             -- optional, default: 'gaussian'.
                                    supported type of kernel: 'linear', 'gaussian',
                                    and 'polynomial'
        kernel_params,           -- optional, default: NULL
                                    parameters for non-linear kernel in a
                                    comma-separated string of key-value pairs. The
                                    parameters differ depending on the value of
                                    kernel_func.
                                    to find out more:

                SELECT {schema_madlib}.{method}('kernel_func')

                                    where replace 'kernel_func' with whatever kernel
                                    you are interested in, i.e.,

                SELECT {schema_madlib}.{method}('gaussian')

        grouping_cols,           -- optional, default NULL
                                    names of columns to group-by
        params,                  -- optional, default NULL
                                    parameters for optimization and regularization in
                                    a comma-separated string of key-value pairs. If a
                                    list of values are provided, then cross-
                                    validation will be performed to select the best
                                    value from the list.
                                    to find out more:

                SELECT {schema_madlib}.{method}('params')

        verbose                  -- optional, default FALSE
                                    whether to print useful info
    );


    ---------------------------------------------------------------------------
                                        OUTPUT
    ---------------------------------------------------------------------------
    The model table produced by svm contains the following columns:

    coef                    FLOAT8,     -- vector of the coefficients.
    grouping_key            TEXT,       -- identifies the group to which
                                           the datum belongs.
    num_rows_processed      BIGINT,     -- numbers of rows processed.
    num_rows_skipped        BIGINT,     -- numbers of rows skipped due
                                           to missing values or failures.
    num_iterations          INTEGER,    -- number of iterations completed by
                                           the optimization algorithm.
                                           The algorithm either converged in this
                                           number of iterations or hit the maximum
                                           number specified in the
                                           optimization parameters.
    loss                    FLOAT8,     -- value of the objective function of
                                           SVM. See Technical Background section
                                           below for more details.
    norm_of_gradient        FLOAT8,     -- value of the L2-norm of the
                                           (sub)-gradient of the objective
                                           function.
    dep_var_mapping         TEXT[],     -- vector of dependent variable labels.
                                           The first entry will correspond to -1
                                           and the second to +1, for internal use.
                                           Since the input table does not have a
                                           dependent variable, a new column is
                                           created while learning the one-class SVM
                                           model.

    An auxiliary table named <model_table>_random is created if the kernel is not
    linear. It contains data needed to embed test data into random feature space
    (see reference [2,3]). This data is used internally by svm_predict and not
    meaningful on its own.

    A summary table named <model_table>_summary is also created at the same time,
    which has the following columns:
    method                  varchar,    -- 'svm'
    version_number          varchar,    -- version of madlib which was used to
                                           generate the model.
    source_table            varchar,    -- the data source table name.
    model_table             varchar,    -- the model table name.
    dependent_varname       varchar,    -- the dependent variable, created automatically.
    independent_varname     varchar,    -- the independent variables.
    kernel_func             varchar,    -- the kernel function.
    kernel_params           varchar,    -- the kernel parameters.
    grouping_col            varchar,    -- columns on which to group.
    optim_params            varchar,    -- a string containing the
                                           optimization parameters.
    reg_params              varchar,    -- a string containing the
                                           regularization parameters.
    num_all_groups          integer,    -- number of groups in SVM training.
    num_failed_groups       integer,    -- number of failed groups in SVM training.
    total_rows_processed    bigint,     -- total numbers of rows processed
                                           in all groups.
    total_rows_skipped      bigint,     -- numbers of rows skipped in all groups
                                           due to missing values or failures.
    """.format(**args)

    if not message:
        return summary

    option = message.lower()
    if option in ('usage', 'help', '?'):
        return usage
    # the parameter help texts are only built when they are asked for
    elif option == 'params':
        return get_svc_params_usage_string()
    elif option == 'gaussian':
        return get_svc_gaussian_usage_string()
    elif option == 'polynomial':
        return get_svc_poly_usage_string()
    else:
        return """
    No such option. Use "SELECT {schema_madlib}.{method}()" for help.
    """.format(**args)
| # ------------------------------------------------------------------------------ |
| |
| |
def svm_help(schema_madlib, message, is_svc, **kwargs):
    """
    Return the help text of svm_classification / svm_regression.

    @param schema_madlib Name of the MADlib schema, substituted into the text
    @param message Topic selector: empty/None for the summary; 'usage',
                   'help' or '?' for the syntax and output description;
                   'example' for a worked example; 'params', 'gaussian' or
                   'polynomial' for the respective parameter lists
                   (matching is case-insensitive)
    @param is_svc True to describe svm_classification, False for
                  svm_regression

    @return Help text as a string; an unknown topic yields a short hint
    """
    method = 'svm_classification' if is_svc else 'svm_regression'

    args = dict(schema_madlib=schema_madlib, method=method)

    summary = """
    ----------------------------------------------------------------
                            SUMMARY
    ----------------------------------------------------------------
    Support Vector Machines (SVMs) are models for regression
    and classification tasks.

    SVM models have two particularly desirable features:
    robustness in the presence of noisy data and applicability
    to a variety of data configurations.

    For more details on function usage:
        SELECT {schema_madlib}.{method}('usage')

    For a small example on using the function:
        SELECT {schema_madlib}.{method}('example')
    """.format(**args)

    # Column names/types below match the CREATE TABLE statements in
    # _build_output_tables.
    usage = """
    ---------------------------------------------------------------------------
                                        USAGE
    ---------------------------------------------------------------------------
    SELECT {schema_madlib}.{method}(
        source_table,            -- name of input table
        model_table,             -- name of output model table
        dependent_varname,       -- name of dependent variable
        independent_varname,     -- names of independent variables
        kernel_func,             -- optional, default: 'linear'.
                                    supported type of kernel: 'linear', 'gaussian',
                                    and 'polynomial'
        kernel_params,           -- optional, default: NULL
                                    parameters for non-linear kernel in a
                                    comma-separated string of key-value pairs. The
                                    parameters differ depending on the value of
                                    kernel_func.
                                    to find out more:

                SELECT {schema_madlib}.{method}('kernel_func')

                                    where replace 'kernel_func' with whatever kernel
                                    you are interested in, i.e.,

                SELECT {schema_madlib}.{method}('gaussian')

        grouping_cols,           -- optional, default NULL
                                    names of columns to group-by
        params,                  -- optional, default NULL
                                    parameters for optimization and regularization in
                                    a comma-separated string of key-value pairs. If a
                                    list of values are provided, then cross-
                                    validation will be performed to select the best
                                    value from the list.
                                    to find out more:

                SELECT {schema_madlib}.{method}('params')

        verbose                  -- optional, default FALSE
                                    whether to print useful info
    );


    ---------------------------------------------------------------------------
                                        OUTPUT
    ---------------------------------------------------------------------------
    The model table produced by svm contains the following columns:

    coef                    FLOAT8,     -- vector of the coefficients.
    grouping_key            TEXT,       -- identifies the group to which
                                           the datum belongs.
    num_rows_processed      BIGINT,     -- numbers of rows processed.
    num_rows_skipped        BIGINT,     -- numbers of rows skipped due
                                           to missing values or failures.
    num_iterations          INTEGER,    -- number of iterations completed by
                                           the optimization algorithm.
                                           The algorithm either converged in this
                                           number of iterations or hit the maximum
                                           number specified in the
                                           optimization parameters.
    loss                    FLOAT8,     -- value of the objective function of
                                           SVM. See Technical Background section
                                           below for more details.
    norm_of_gradient        FLOAT8,     -- value of the L2-norm of the
                                           (sub)-gradient of the objective
                                           function.
    dep_var_mapping         TEXT[],     -- vector of dependent variable labels.
                                           The first entry will correspond to -1
                                           and the second to +1, for internal use.

    An auxiliary table named <model_table>_random is created if the kernel is not
    linear. It contains data needed to embed test data into random feature space
    (see reference [2,3]). This data is used internally by svm_predict and not
    meaningful on its own.

    A summary table named <model_table>_summary is also created at the same time,
    which has the following columns:
    method                  varchar,    -- 'svm'
    version_number          varchar,    -- version of madlib which was used to
                                           generate the model.
    source_table            varchar,    -- the data source table name.
    model_table             varchar,    -- the model table name.
    dependent_varname       varchar,    -- the dependent variable.
    independent_varname     varchar,    -- the independent variables.
    kernel_func             varchar,    -- the kernel function.
    kernel_params           varchar,    -- the kernel parameters.
    grouping_col            varchar,    -- columns on which to group.
    optim_params            varchar,    -- a string containing the
                                           optimization parameters.
    reg_params              varchar,    -- a string containing the
                                           regularization parameters.
    num_all_groups          integer,    -- number of groups in SVM training.
    num_failed_groups       integer,    -- number of failed groups in SVM training.
    total_rows_processed    bigint,     -- total numbers of rows processed
                                           in all groups.
    total_rows_skipped      bigint,     -- numbers of rows skipped in all groups
                                           due to missing values or failures.
    """.format(**args)

    # The summary advertises an 'example' topic, so one has to exist.
    # Classification predicts a boolean expression, regression the raw price.
    example_usage = """
    ---------------------------------------------------------------------------
                                    EXAMPLES
    ---------------------------------------------------------------------------
    - Create an input data set.

    CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
                size INT, lot INT);
    COPY houses FROM STDIN WITH DELIMITER '|';
      1 | 590 | 2 | 1 | 50000 | 770 | 22100
      2 | 1050 | 3 | 2 | 85000 | 1410 | 12000
      3 | 20 | 3 | 1 | 22500 | 1060 | 3500
      4 | 870 | 2 | 2 | 90000 | 1300 | 17500
      5 | 1320 | 3 | 2 | 133000 | 1500 | 30000
      6 | 1350 | 2 | 1 | 90500 | 820 | 25700
      7 | 2790 | 3 | 2.5 | 260000 | 2130 | 25000
      8 | 680 | 2 | 1 | 142500 | 1170 | 22000
      9 | 1840 | 3 | 2 | 160000 | 1500 | 19000
     10 | 3680 | 4 | 2 | 240000 | 2790 | 20000
     11 | 1660 | 3 | 1 | 87000 | 1030 | 17500
     12 | 1620 | 3 | 2 | 118600 | 1250 | 20000
     13 | 3100 | 3 | 2 | 140000 | 1760 | 38000
     14 | 2070 | 2 | 3 | 148000 | 1550 | 14000
     15 | 650 | 3 | 1.5 | 65000 | 1450 | 12000
    \\.

    - Train a model, using a linear kernel.

    SELECT {schema_madlib}.{method}('houses',
                                    'houses_svm',
                                    '{example_dep_var}',
                                    'ARRAY[1, tax, bath, size]');

    - View the resulting model.

    SELECT * FROM houses_svm;

    - Generate a nonlinear model using a Gaussian kernel. This time we
    specify the initial step size and maximum number of iterations to run.
    As part of the kernel parameter, we choose 10 as the dimension of the
    space where we train SVM.

    SELECT {schema_madlib}.{method}('houses',
                                    'houses_svm_gaussian',
                                    '{example_dep_var}',
                                    'ARRAY[1, tax, bath, size]',
                                    'gaussian',
                                    'n_components=10',
                                    '',
                                    'init_stepsize=1, max_iter=200');

    - To evaluate the models with the prediction function, see:

    SELECT {schema_madlib}.svm_predict('example')
    """.format(example_dep_var='price < 100000' if is_svc else 'price',
               **args)

    if not message:
        return summary

    option = message.lower()
    if option in ('usage', 'help', '?'):
        return usage
    elif option == 'example':
        return example_usage
    # the parameter help texts are only built when they are asked for
    elif option == 'params':
        return get_svc_params_usage_string()
    elif option == 'gaussian':
        return get_svc_gaussian_usage_string()
    elif option == 'polynomial':
        return get_svc_poly_usage_string()
    else:
        return """
    No such option. Use "SELECT {schema_madlib}.{method}()" for help.
    """.format(**args)
| # ------------------------------------------------------------------------------ |
| |
| |
def svm(schema_madlib, source_table, model_table,
        dependent_varname, independent_varname, kernel_func,
        kernel_params, grouping_col, params, is_svc,
        verbose, **kwargs):
    """
    Train a support vector machine model (classification or regression).

    @param schema_madlib Name of the MADlib schema
    @param source_table Name of the table holding the training data
    @param model_table Name of the model table to create
    @param dependent_varname Dependent variable expression
    @param independent_varname Independent variable expression
    @param kernel_func Kernel name, resolved through _get_kernel_name
    @param kernel_params Kernel parameters as a key-value string
    @param grouping_col Grouping columns, or empty/None
    @param params Optimization/regularization parameters as a key-value string
    @param is_svc True for classification, False for regression; it reaches
                  the downstream functions through the argument dictionary
    @param verbose When true the message level is lowered to "warning"

    NOTE: the argument dictionary passed downstream is built with locals(),
    so the local variable names in this function are part of the contract
    with _cross_validate_svm and _svm_parsed_params. Do not rename them or
    add new locals before the 'args = locals()' line.
    """
    # verbosing
    verbosity_level = "warning" if verbose else "error"
    with MinWarning(verbosity_level):
        _verify_table(source_table, model_table,
                      dependent_varname, independent_varname)
        # Column names produced in the model table; they may not be used as
        # grouping columns. Each name needs its own comma: adjacent string
        # literals are silently concatenated ('loss' 'num_rows_processed'
        # used to become the single bogus entry 'lossnum_rows_processed').
        reserved_cols = ['coef', 'random_feature_data', 'loss',
                         'num_rows_processed', 'num_rows_skipped',
                         'norm_of_gradient', 'num_iterations']
        grouping_str, grouping_col = \
            get_grouping_col_str(schema_madlib, 'SVM', reserved_cols,
                                 source_table, grouping_col)
        kernel_func = _get_kernel_name(kernel_func)
        transformer = _transform_w_kernel(schema_madlib, source_table,
                                          dependent_varname, independent_varname,
                                          kernel_func, kernel_params,
                                          grouping_col)
        args = locals()
        args.update(_extract_params(schema_madlib, params))
        # when the kernel produced a transformed table, train on it instead
        if transformer.transformed_table:
            args.update(transformer.transformed_table)
        _cross_validate_svm(args)
        _svm_parsed_params(use_transformer_for_output=True, **args)
        transformer.clear()
| # ------------------------------------------------------------------------------ |
| |
| |
| def _cross_validate_svm(args): |
| # updating params_dict will also update args['params_dict'] |
| params_dict = args |
| |
| if params_dict['n_folds'] > 1 and args['grouping_col']: |
| plpy.error('SVM Error: cross validation ' |
| 'with grouping is not supported!') |
| |
| cv_params = {} |
| if len(params_dict['lambda']) > 1: |
| cv_params['lambda'] = params_dict['lambda'] |
| else: |
| params_dict['lambda'] = params_dict['lambda'][0] |
| if len(params_dict['epsilon']) > 1 and not args['is_svc']: |
| cv_params['epsilon'] = params_dict['epsilon'] |
| else: |
| params_dict['epsilon'] = params_dict['epsilon'][0] |
| if len(params_dict['init_stepsize']) > 1: |
| cv_params['init_stepsize'] = params_dict['init_stepsize'] |
| else: |
| params_dict['init_stepsize'] = params_dict['init_stepsize'][0] |
| if len(params_dict['max_iter']) > 1: |
| cv_params['max_iter'] = params_dict['max_iter'] |
| else: |
| params_dict['max_iter'] = params_dict['max_iter'][0] |
| if len(params_dict['decay_factor']) > 1: |
| cv_params['decay_factor'] = params_dict['decay_factor'] |
| else: |
| params_dict['decay_factor'] = params_dict['decay_factor'][0] |
| |
| if not cv_params and params_dict['n_folds'] <= 1: |
| # no cross validation |
| return |
| |
| if cv_params and params_dict['n_folds'] <= 1: |
| plpy.error("SVM Error: All parameters must be scalar " |
| "or of length 1 when n_folds is 0 or 1") |
| |
| if not cv_params and params_dict['n_folds'] > 1: |
| plpy.warning('SVM Warning: n_folds > 1 but no cross validate params provided' |
| 'Ignoring cross validation request.') |
| return |
| |
| scorer = 'classification' if args['is_svc'] else 'regression' |
| # svm in cross validation should not transform the data, |
| # since test data in cross validation comes from the transformed source table. |
| # A linear transformer without intercept is a no-op transformer. |
| no_op_kernel = create_kernel(args['schema_madlib'], 0, |
| 'linear', {'fit_intercept': False}) |
| no_op_transformer = no_op_kernel.transform(args['source_table'], |
| args['independent_varname'], |
| args['dependent_varname']) |
| transformer = args.get('transformer', no_op_transformer) |
| args.update(dict(transformer=no_op_transformer)) |
| cv = CrossValidator(_svm_parsed_params, svm_predict, scorer, args) |
| val_res = cv.validate(cv_params, params_dict['n_folds']) |
| val_res.output_tbl(params_dict['validation_result']) |
| params_dict.update(val_res.top('sub_args')) |
| args.update(dict(transformer=transformer)) |
| # ------------------------------------------------------------------------------ |
| |
| |
| def _get_kernel_name(kernel_func): |
| if not kernel_func: |
| kernel_func = 'linear' |
| else: |
| # Add non-linear kernels below after implementing them. |
| supported_kernels = ['linear', 'gaussian', 'polynomial'] |
| try: |
| # allow user to specify a prefix substring of |
| # supported kernels. This works because the supported |
| # kernels have unique prefixes. |
| kernel_func = next(x for x in supported_kernels |
| if x.startswith(kernel_func)) |
| except StopIteration: |
| # next() returns a StopIteration if no element found |
| plpy.error("SVM Error: Invalid kernel function: " |
| "{0}. Supported kernel functions are ({1})" |
| .format(kernel_func, ','.join(sorted(supported_kernels)))) |
| return kernel_func |
| # ------------------------------------------------------------------------------ |
| |
| |
def _transform_w_kernel(schema_madlib, source_table, dependent_varname,
                        independent_varname, kernel_func,
                        kernel_params, grouping_col, override_fit_intercept=False):
    """ Transform source table with a kernel function and return the transformer.

    Args:
        @param schema_madlib: str, Name of the MADlib schema
        @param source_table: str, Name of the table with input data
        @param dependent_varname: str, Name of the column containing response variable
        @param independent_varname: str, Name of the column containing feature variables
        @param kernel_func: str, Name of the kernel to apply
        @param kernel_params: str, Key-value set of parameters for the kernel class
        @param grouping_col: str, Comma-separated list of grouping column names
        @param override_fit_intercept: bool, If True, the fit_intercept parameter
                                        in kernel_params is always set to True
                                        independent of user input. No-op if
                                        this is False.

    Returns:
        Whatever the fitted kernel's transform() returns (the callers in
        this file use it as the transformer object).
    """
    feature_count = num_features(source_table, independent_varname)
    parsed_kernel_params = _extract_kernel_params(kernel_params, feature_count)
    if override_fit_intercept:
        parsed_kernel_params['fit_intercept'] = True

    kernel = create_kernel(schema_madlib, feature_count,
                           kernel_func, parsed_kernel_params)
    fitted_kernel = kernel.fit(feature_count)
    return fitted_kernel.transform(source_table, independent_varname,
                                   dependent_varname, grouping_col)
| # ------------------------------------------------------------------------------ |
| |
| |
| def _compute_class_weight_sql(source_table, dependent_varname, |
| is_svc, class_weight_str): |
| """ |
| Args: |
| @param is_svc: Boolean, indicates if classification or regression |
| |
| Returns: |
| str. String when executed in SQL computes the class weight for each tuple |
| """ |
| if not is_svc or not class_weight_str: |
| return "1" |
| |
| dep_to_weight = defaultdict(float) |
| class_weight_str = class_weight_str.strip() |
| if class_weight_str == "balanced": |
| # use half of n_samples since only doing binary classification |
| # Change the '2' to n_classes for multinomial |
| n_samples_per_class = num_samples(source_table) / 2 |
| bin_count = plpy.execute("""SELECT {dep} as k, count(*) as v |
| FROM {src} |
| GROUP BY {dep} |
| """.format(dep=dependent_varname, |
| src=source_table)) |
| for each_count in bin_count: |
| dep_to_weight[each_count['k']] = n_samples_per_class / each_count['v'] |
| elif _is_class_weights_str_a_mapping(class_weight_str): |
| # preprocess_keyvalue_params() does not seem to handle special |
| # chars as expected. TODO: Fix it in MADLIB-1354. |
| class_weight_splits = preprocess_keyvalue_params( |
| class_weight_str, split_char=':') |
| |
| _assert(class_weight_splits and len(class_weight_splits)<=2, |
| "SVM: Only binary classification is supported. The " |
| "class_weight param should have at least one and at most " |
| "two labels in it.") |
| # Cast the distinct class values' array to a text array since a |
| # numeric class will show up with suffix 'L' sometimes, and that |
| # may cause issues when we try to check if a class level specified |
| # in class_weight (a string) exists in the distinct class levels |
| # or not. |
| distinct_class_levels = plpy.execute(""" |
| SELECT array_agg(DISTINCT({0}))::TEXT[] AS labels |
| FROM {1} |
| """.format(dependent_varname, source_table))[0]['labels'] |
| for each_pair in class_weight_splits: |
| k, v = each_pair.split(":") |
| _assert(k in distinct_class_levels, |
| "SVM: Key '{0}' in '{1}' is not a valid class label.". |
| format(k, class_weight_str)) |
| try: |
| dep_to_weight[k.strip()] = float(v.strip()) |
| except ValueError: |
| plpy.error("SVM: Weights for a class label must be numeric." |
| " Invalid class_weights param ({0})".format( |
| class_weight_str)) |
| else: |
| plpy.error("SVM: Invalid class_weight param ({0})".format( |
| class_weight_str)) |
| |
| class_weight_sql = "CASE " |
| for k, v in dep_to_weight.items(): |
| class_weight_sql += ("WHEN {dep}=$madlib${k}$madlib$ THEN {v}::FLOAT8 \n". |
| format(dep=dependent_varname, k=k, v=v)) |
| class_weight_sql += "ELSE 1.0 END" |
| return class_weight_sql |
| # ------------------------------------------------------------------------- |
| |
| def _is_class_weights_str_a_mapping(class_weight_str): |
| """ |
| Check if the class_weight_str begins with a '{' and ends with a '}' |
| """ |
| return len(class_weight_str)>2 and class_weight_str[0]=='{' and \ |
| class_weight_str[-1]=='}' |
| |
| |
def _svm_parsed_params(schema_madlib, source_table, model_table,
                       dependent_varname, independent_varname,
                       transformer, grouping_str,
                       grouping_col, is_svc,
                       use_transformer_for_output=False,
                       update_source_for_one_class=False,
                       verbose=False, **kwargs):
    """
    Train the SVM model once all parameters have been parsed.

    Assembles the argument dictionary for the iterative solver, runs it via
    _compute_svm() and creates the output tables via _build_output_tables().

    Args:
        @param use_transformer_for_output: bool,
            This variable decides if the output tables are created using either
            the 'args' supplied in this function or the 'original_table'
            structure in the transformer. This is necessary to allow creating
            temporary output tables from cross validation which are different
            from the 'original_table' used in the transformer.
        @param update_source_for_one_class: bool,
            This is a special indicator added here for svm_one_class. This has
            to be placed here instead of the svm_one_class function so that
            cross validation undergoes the same transformation for its split
            datasets.
        @param kwargs: dict, Remaining parsed parameters. Must contain
            'class_weight'; it is also passed to _verify_get_params_dict()
            and _build_output_tables().

    NOTE: locals() is captured into the solver's argument dictionary below,
    so the local variable names of this function are part of its contract
    with the helpers it calls.
    """
    n_features = num_features(source_table, independent_varname)
    if update_source_for_one_class:
        # This block is run only when the caller is svm_one_class

        # Create a temporary relation with a dependent variable and insert
        # the origin into kernel space. Kernel adds an intercept at the end of the
        # independent_varname. Here an origin is added to the source table, with
        # the final value set to 1.
        dependent_varname = unique_string(desp='dep_var')
        source_w_origin = unique_string(desp='src_tbl')
        # NOTE(review): UNION (unlike UNION ALL) also removes duplicate rows
        # of the source relation -- confirm that this is intended.
        plpy.execute("""
            CREATE TEMP VIEW {source_w_origin} AS
            SELECT {independent_varname},
                   1.0 AS {dependent_varname}
            FROM {source_table}
            UNION
            SELECT
                array_append(
                    {schema_madlib}.array_fill(
                        {schema_madlib}.array_of_float({n_features} - 1),
                        0::float)::float[],
                    1::float
                ) as {independent_varname},
                -1::float as {dependent_varname}
            """.format(source_w_origin=source_w_origin,
                       independent_varname=independent_varname,
                       dependent_varname=dependent_varname,
                       source_table=source_table,
                       schema_madlib=schema_madlib,
                       n_features=n_features))
        source_table = source_w_origin
        if transformer.transformed_table:
            transformer.transformed_table.update({
                'source_table': source_w_origin,
                'dependent_varname': dependent_varname})

    class_weight_sql = _compute_class_weight_sql(
        source_table, dependent_varname, is_svc, kwargs['class_weight'])

    # every local defined above becomes an entry of the solver arguments
    args = locals()
    args.update(
        rel_args=unique_string(desp='rel_args'),
        rel_state=unique_string(desp='rel_state'),
        col_grp_iteration=unique_string(desp='col_grp_iteration'),
        col_grp_state=unique_string(desp='col_grp_state'),
        col_grp_key=unique_string(desp='col_grp_key'),
        col_n_tuples=unique_string(desp='col_n_tuples'),
        state_type="double precision[]",
        rel_source=source_table,
        col_ind_var=independent_varname,
        col_dep_var=dependent_varname)

    for extra_args in (_verify_get_params_dict(kwargs),):
        args.update(extra_args)
    # _process_epsilon reads entries of args, so it has to see the updates
    # made so far
    args.update(_process_epsilon(is_svc, args))
    args.update(_svc_or_svr(is_svc, source_table, dependent_varname))

    # place holder for compatibility
    plpy.execute("DROP TABLE IF EXISTS {0}".format(args['rel_args']))
    plpy.execute("CREATE TABLE pg_temp.{0} AS SELECT 1".format(args['rel_args']))
    # actual iterative algorithm computation
    n_iters_run = _compute_svm(args)
    _build_output_tables(n_iters_run, args, **kwargs)
| # ----------------------------------------------------------------------------- |
| |
| |
def svm_predict(schema_madlib, model_table, new_data_table, id_col_name,
                output_table, **kwargs):
    """ Score data points stored in a table using a learned support vector model.

    The training settings (method, variable names, kernel and grouping) are
    read from the '<model_table>_summary' table. The new data is passed
    through the same kernel as in training, and one row per data point is
    written to output_table with the prediction and the value of the
    decision function.

    @param schema_madlib    Name of the MADlib schema
    @param model_table      Name of learned model
    @param new_data_table   Name of table/view containing the data
                            points to be scored
    @param id_col_name      Name of column in new_data_table containing
                            (integer) identifier for data point
    @param output_table     Name of table to store the results
    """
    with MinWarning("warning"):
        # model table
        input_tbl_valid(model_table, 'SVM')
        cols_in_tbl_valid(model_table, ['coef'], 'SVM')
        # summary table; 'method' is validated too since it is read below
        summary_table = add_postfix(model_table, "_summary")
        input_tbl_valid(summary_table, 'SVM')
        cols_in_tbl_valid(summary_table,
                          ['method', 'dependent_varname',
                           'independent_varname', 'kernel_func',
                           'kernel_params', 'grouping_col'],
                          'SVM')

        # read necessary info from summary
        summary = plpy.execute("""
            SELECT
                method, dependent_varname, independent_varname,
                kernel_func, kernel_params, grouping_col
            FROM {summary_table}
            """.format(summary_table=summary_table))[0]
        method = summary['method']
        independent_varname = summary['independent_varname']
        kernel_func = summary['kernel_func']
        kernel_params = summary['kernel_params']
        grouping_col = summary['grouping_col']
        # the summary table stores the text 'NULL' when there is no grouping
        grouping_col = None if grouping_col == 'NULL' else grouping_col

        input_tbl_valid(new_data_table, 'SVM')
        # Names handed to get_grouping_col_str() as reserved column names.
        # Every entry needs its own comma: adjacent string literals are
        # silently concatenated by Python.
        reserved_cols = ['coef', 'random_feature_data', 'loss',
                         'num_rows_processed', 'num_rows_skipped',
                         'norm_of_gradient', 'num_iterations']
        grouping_str, grouping_col = get_grouping_col_str(
            schema_madlib, 'SVM', reserved_cols,
            new_data_table, grouping_col)
        _assert(is_var_valid(new_data_table, independent_varname),
                "SVM Error: independent_varname ('" + independent_varname +
                "') is invalid for new_data_table (" + new_data_table + ")!")
        _assert(id_col_name is not None, "SVM Error: id_col_name is NULL!")
        _assert(is_var_valid(new_data_table, id_col_name),
                "SVM Error: id_col_name ('" + id_col_name +
                "') is invalid for new_data_table (" + new_data_table + ")!")
        output_tbl_valid(output_table, 'SVM')

        kernel_params_dict = _extract_kernel_params(kernel_params)
        random_table = add_postfix(model_table, '_random')
        if kernel_func.lower() != 'linear':
            # random table is not created with the linear kernel and ignored
            # in the load_kernel call, hence we disable the check for 'linear'
            input_tbl_valid(random_table, 'SVM')
        transformer = load_kernel(schema_madlib, random_table,
                                  kernel_func, kernel_params_dict)
        transformer.transform(new_data_table, independent_varname,
                              grouping_col=grouping_col, id_col=id_col_name)
        # score the kernel-transformed relation when there is one
        if transformer.transformed_table:
            data_rel_info = transformer.transformed_table
        else:
            data_rel_info = transformer.original_table
        new_data_table = data_rel_info['source_table']
        independent_varname = data_rel_info['independent_varname']

        # signed value of the decision function: <coef, features>
        pred_dist = """{0}.array_dot(coef::double precision [],
                                     {1}::double precision [])
                    """.format(schema_madlib, independent_varname)
        if method.upper() == 'SVC':
            # non-negative decision values map to the second class label,
            # negative ones to the first
            pred_query = """
                CASE WHEN {pred_dist} >= 0
                    THEN dep_var_mapping[2]
                    ELSE dep_var_mapping[1]
                END
                """.format(pred_dist=pred_dist)
        elif method.upper() == 'SVR':
            pred_query = pred_dist
        else:
            plpy.error("SVM Error: Invalid 'method' value in summary table. "
                       "'method' can only be SVC or SVR!")

        sql_args = dict(output_table=output_table,
                        id_col_name=id_col_name,
                        pred_query=pred_query,
                        pred_dist=pred_dist,
                        grouping_str=grouping_str,
                        grouping_col=grouping_col,
                        model_table=model_table,
                        new_data_table=new_data_table,
                        schema_madlib=schema_madlib,
                        independent_varname=independent_varname)
        if grouping_col:
            # one model per group: match data rows to the model of their group
            sql = """
                CREATE TABLE {output_table} AS
                SELECT
                    {id_col_name} AS {id_col_name},
                    {pred_query} AS prediction,
                    {pred_dist} AS decision_function,
                    ARRAY[{grouping_str}] as grouping_col,
                    {grouping_col}
                FROM {model_table}
                JOIN {new_data_table}
                USING ({grouping_col})
                WHERE not {schema_madlib}.array_contains_null({independent_varname})
                ORDER BY grouping_col, {id_col_name}
                """.format(**sql_args)
        else:
            sql = """
                CREATE TABLE {output_table} AS
                SELECT
                    {id_col_name} AS {id_col_name},
                    {pred_query} as prediction,
                    {pred_dist} AS decision_function
                FROM
                    {model_table},
                    {new_data_table}
                WHERE
                    not {schema_madlib}.array_contains_null({independent_varname})
                """.format(**sql_args)
        plpy.execute(sql)
        transformer.clear()
| # ----------------------------------------------------------------------------- |
| |
| |
| def _svc_or_svr(is_svc, source_table, dependent_varname): |
| # transform col_dep_var to binary (1`or -1) if classification |
| _args = {'col_dep_var_trans': dependent_varname, |
| 'mapping': 'NULL', |
| 'method': 'SVR'} |
| |
| if is_svc: |
| # dependent variable mapping |
| dep_labels = plpy.execute(""" |
| SELECT {dependent_varname} AS y |
| FROM {source_table} |
| WHERE ({dependent_varname}) IS NOT NULL |
| GROUP BY ({dependent_varname}) |
| ORDER BY ({dependent_varname}) |
| """.format(source_table=source_table, |
| dependent_varname=dependent_varname)) |
| |
| dep_var_mapping = ["'{0}'".format(d['y']) |
| if isinstance(d['y'], basestring) |
| else str(d['y']) for d in dep_labels] |
| |
| _assert(1 <= len(dep_var_mapping) <= 2, |
| "SVM Error: Classification currently " |
| "only supports unary or binary output!. Found values {0}". |
| format(dep_var_mapping)) |
| |
| col_dep_var_trans = (""" |
| CASE WHEN ({col_dep_var}) IS NULL THEN NULL |
| WHEN ({col_dep_var}) = {mapped_value_for_negative} THEN -1.0 |
| ELSE 1.0 |
| END |
| """.format(col_dep_var=dependent_varname, |
| mapped_value_for_negative=dep_var_mapping[0])) |
| _args.update({ |
| 'mapped_value_for_negative': dep_var_mapping[0], |
| 'col_dep_var_trans': col_dep_var_trans, |
| 'mapping': dep_var_mapping[0] + "," + dep_var_mapping[1], |
| 'method': 'SVC'}) |
| return _args |
| # ----------------------------------------------------------------------------- |
| |
| |
| def _process_epsilon(is_svc, args): |
| eps_table = args['eps_table'] |
| grouping_col = args['grouping_col'] |
| grouping_str = args['grouping_str'] |
| col_grp_key = args['col_grp_key'] |
| rel_source = args['rel_source'] |
| epsilon = args['epsilon'] |
| rel_epsilon = '' |
| select_epsilon = '{0}'.format(epsilon) |
| as_rel_source = '_src' |
| |
| if not is_svc and grouping_col and eps_table: |
| rel_epsilon = unique_string(desp='rel_epsilon') |
| input_tbl_valid(eps_table, 'SVM') |
| _assert(is_var_valid(eps_table, grouping_col), |
| "SVM Error: invalid column names ('{grouping_col}') " |
| "for eps_table ('{eps_table}')!" |
| .format(grouping_col=grouping_col, |
| eps_table=eps_table)) |
| plpy.execute(""" |
| DROP TABLE IF EXISTS {rel_epsilon}; |
| CREATE TEMPORARY TABLE {rel_epsilon} AS ( |
| SELECT |
| {col_grp_key}, |
| coalesce(epsilon, {epsilon}) AS epsilon |
| FROM ( |
| SELECT |
| array_to_string(ARRAY[{grouping_str}], ',') AS |
| {col_grp_key} |
| FROM |
| {rel_source} |
| GROUP BY {grouping_col} |
| ) q1 |
| LEFT JOIN |
| ( |
| SELECT |
| array_to_string(ARRAY[{grouping_str}], ',') AS |
| {col_grp_key}, |
| epsilon |
| FROM {eps_table} |
| ) q2 |
| USING ({col_grp_key}) |
| ); |
| """.format(**locals())) |
| |
| select_epsilon = ( |
| """ |
| ( |
| SELECT epsilon |
| FROM |
| {rel_epsilon} |
| WHERE |
| {rel_epsilon}.{col_grp_key} = {as_rel_source}.{col_grp_key} |
| ) |
| """ |
| .format(**locals())) |
| |
| return {'select_epsilon': select_epsilon, |
| 'epsilon': epsilon, |
| 'rel_epsilon': rel_epsilon, |
| 'as_rel_source': as_rel_source} |
| # ----------------------------------------------------------------------------- |
| |
| |
def _extract_kernel_params(kernel_params='', n_features=10):
    """
    Parse the kernel parameter string into a dictionary.

    Args:
        @param kernel_params: str, Key-value set of parameters for the kernel
        @param n_features: int, Number of features; the defaults of
                           'n_components' and 'gamma' are derived from it

    Returns:
        The result of extract_keyvalue_params() for the types and defaults
        listed below.
    """
    # (name, type, default value)
    param_specs = (
        # common params
        ('n_components', int, max(100, 2 * n_features)),
        ('fit_intercept', bool, False),
        ('random_state', int, 1),
        # polynomial params
        ('degree', int, 3),
        ('coef0', float, 1),
        # gaussian params
        ('fit_in_memory', bool, True),
        ('gamma', float, 1 / n_features),
    )
    params_types = dict((name, kind) for name, kind, _ in param_specs)
    params_default = dict((name, value) for name, _, value in param_specs)
    return extract_keyvalue_params(kernel_params, params_types, params_default)
| # ----------------------------------------------------------------------------- |
| |
| |
def _extract_params(schema_madlib, params, module='SVM'):
    """
    Parse and validate the optimizer/regularization parameter string.

    Args:
        @param schema_madlib: str, Name of the MADlib schema (unused here)
        @param params: str, Key-value set of parameters
        @param module: str, Module name used as prefix of error messages

    Returns:
        dict of parameter values. 'lambda', 'epsilon', 'init_stepsize',
        'max_iter' and 'decay_factor' are lists (cross validation may search
        over them), 'norm' is lower-cased and the derived boolean 'is_l2'
        is added.
    """
    # NOTICE: the type of values in params_default should be consistent with
    # the types specified in params_types
    params_default = {
        'init_stepsize': [0.01],
        'decay_factor': [0.9],
        'max_iter': [100],
        'tolerance': 1e-10,
        'lambda': [0.01],
        'norm': 'L2',
        'n_folds': 0,
        'validation_result': '',
        'epsilon': [0.01],
        'eps_table': '',
        'class_weight': ''}

    params_types = {
        'init_stepsize': list,
        'decay_factor': list,
        'max_iter': list,
        'tolerance': float,
        'lambda': list,
        'norm': str,
        'n_folds': int,
        'validation_result': str,
        'epsilon': list,
        'eps_table': str,
        'class_weight': str}

    params_vals = extract_keyvalue_params(params, params_types, params_default)
    if params_vals['n_folds'] < 0:
        plpy.error("{0} Error: n_folds must be non-negative!".format(module))

    # The list parameters are converted with list comprehensions (not map())
    # so that they are real lists: they are validated here and later
    # measured with len() and indexed by _cross_validate_svm().
    # validate lambda
    params_vals['lambda'] = [float(v) for v in params_vals['lambda']]
    _assert(all(lmd >= 0 for lmd in params_vals['lambda']),
            "{0} Error: lambda must be non-negative!".format(module))
    # validate epsilon
    params_vals['epsilon'] = [float(v) for v in params_vals['epsilon']]
    _assert(all(e >= 0 for e in params_vals['epsilon']),
            "{0} Error: epsilon must be non-negative!".format(module))
    # validating cross validation is delegated to _cross_validate_svm()
    params_vals['init_stepsize'] = [float(v)
                                    for v in params_vals['init_stepsize']]
    _assert(all(e > 0 for e in params_vals['init_stepsize']),
            "{0} Error: init_stepsize must be positive!".format(module))
    params_vals['max_iter'] = [int(v) for v in params_vals['max_iter']]
    _assert(all(e > 0 for e in params_vals['max_iter']),
            "{0} Error: max_iter must be positive!".format(module))
    params_vals['decay_factor'] = [float(v)
                                   for v in params_vals['decay_factor']]
    _assert(all(e <= 1 for e in params_vals['decay_factor']),
            "{0} Error: decay_factor must be <= 1!".format(module))

    if params_vals['validation_result']:
        # report problems under the caller's module name
        output_tbl_valid(params_vals['validation_result'], module)

    params_vals['norm'] = params_vals['norm'].lower()
    _assert(params_vals['norm'] in ('l1', 'l2'),
            "{0} Error: norm must be either L1 or L2!".format(module))
    _assert(params_vals['tolerance'] >= 0,
            "{0} error: tolerance must be non-negative!".format(module))

    params_vals['is_l2'] = params_vals['norm'] == 'l2'
    return params_vals
| # ------------------------------------------------------------------------- |
| |
| |
| import unittest |
| |
| |
class SVMTestCase(unittest.TestCase):
    """
    Unit tests for the key-value parameter preprocessing.

    Comment out "import plpy" and replace plpy.error calls with appropriate
    Python Exceptions to successfully run the test cases
    """
    def setUp(self):
        # well-formed input
        self.optimizer_params1 = 'max_iter=10, optimizer="irls", precision=1e-4'
        # unquoted value containing a dash
        self.optimizer_params2 = 'max_iter=2.01, optimizer=newton-irls, precision=1e-5'
        # stray value and a key without a value
        self.optimizer_params3 = 'max_iter=10, 10, optimizer=, lambda={1,2,3,4}'
        # malformed number
        self.optimizer_params4 = 'max_iter=10, optimizer="irls",precision=0.02.01, lambda={1,2,3,4}'
        self.optimizer_types = {'max_iter': int, 'optimizer': str,
                                'lambda': list, 'precision': float}

    def test_preprocess_optimizer(self):
        # (raw parameter string, expected list of key=value tokens)
        cases = [
            (self.optimizer_params1,
             ['max_iter=10', 'optimizer="irls"', 'precision=1e-4']),
            (self.optimizer_params2,
             ['max_iter=2.01', 'optimizer=newton-irls', 'precision=1e-5']),
            (self.optimizer_params3,
             ['max_iter=10', 'lambda={1,2,3,4}']),
            (self.optimizer_params4,
             ['max_iter=10', 'optimizer="irls"', 'precision=0.02',
              'lambda={1,2,3,4}']),
        ]
        for raw_params, expected_tokens in cases:
            self.assertEqual(preprocess_keyvalue_params(raw_params),
                             expected_tokens)
| |
| |
# Run the unit tests above when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()