| #!/usr/bin/env python |
| |
| import plpy |
| |
| # ----------------------------------------------- |
| # Function to run the regression algorithm |
| # ----------------------------------------------- |
| def sv_regression( input_table, modelname, parallel): |
| """ |
| Executes the support vector regression algorithm. |
| |
| @param input_table Name of table/view containing the training data |
| @param model_name Name under which we want to store the learned model |
| @param parallel A flag indicating whether the system should learn multiple models in parallel. |
| |
| """ |
| |
| if (parallel) : |
| # Learning multiple models in parallel |
| |
| # Output error if models with the same modelname already exist |
| sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\''; |
| seen = plpy.execute(sql); |
| if (seen[0]['count'] > 0): |
| plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function'); |
| |
| # Start learning process |
| sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ' group by gp_segment_id)'; |
| plpy.execute(sql); |
| |
| # Store the models learned |
| numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table); |
| numproc = numproc_t[0]['count']; |
| plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')'); |
| else : |
| # Learning a single model |
| |
| # Output error if a model with the same modelname already exists |
| sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\''; |
| seen = plpy.execute(sql); |
| if (seen[0]['count'] > 0): |
| plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function'); |
| |
| # Start learning process |
| sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ')'; |
| plpy.execute(sql); |
| |
| # Store the model learned |
| plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')'); |
| |
| return '''Finished support vector regression learning on %s table. |
| ''' % (input_table) |
| |
| # ----------------------------------------------- |
| # Function to run the classification algorithm |
| # ----------------------------------------------- |
| def sv_classification( input_table, modelname, parallel): |
| """ |
| Executes the support vector classification algorithm. |
| |
| @param input_table Name of table/view containing the training data |
| @param model_name Name under which we want to store the learned model |
| @param parallel A flag indicating whether the system should learn multiple models in parallel. |
| |
| """ |
| |
| if (parallel) : |
| # Learning multiple models in parallel |
| |
| # Output error if models with the same modelname already exist |
| sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\''; |
| seen = plpy.execute(sql); |
| if (seen[0]['count'] > 0): |
| plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function'); |
| |
| # Start learning process |
| sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ' group by gp_segment_id)'; |
| plpy.execute(sql); |
| |
| # Store the models learned |
| numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table); |
| numproc = numproc_t[0]['count']; |
| plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')'); |
| else : |
| # Learning a single model |
| |
| # Output error if a model with the same modelname already exists |
| sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\''; |
| seen = plpy.execute(sql); |
| if (seen[0]['count'] > 0): |
| plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function'); |
| |
| # Start learning process |
| sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ')'; |
| plpy.execute(sql); |
| |
| # Store the model learned |
| plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')'); |
| |
| return '''Finished support vector classification learning on %s table. |
| ''' % (input_table) |
| |
| |
| # ----------------------------------------------- |
| # Function to run the novelty detection algorithm |
| # ----------------------------------------------- |
| def sv_novelty_detection( input_table, modelname, parallel): |
| """ |
| Executes the support vector novelty detection algorithm. |
| |
| @param input_table Name of table/view containing the training data |
| @param model_name Name under which we want to store the learned model |
| @param parallel A flag indicating whether the system should learn multiple models in parallel. |
| |
| """ |
| |
| if (parallel) : |
| # Learning multiple models in parallel |
| |
| # Output error if models with the same modelname already exist |
| sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\''; |
| seen = plpy.execute(sql); |
| if (seen[0]['count'] > 0): |
| plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function'); |
| |
| # Start learning process |
| sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ' group by gp_segment_id)'; |
| plpy.execute(sql); |
| |
| # Store the models learned |
| numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table); |
| numproc = numproc_t[0]['count']; |
| plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')'); |
| else : |
| # Learning a single model |
| |
| # Output error if a model with the same modelname already exists |
| sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\''; |
| seen = plpy.execute(sql); |
| if (seen[0]['count'] > 0): |
| plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function'); |
| |
| # Start learning process |
| sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ')'; |
| plpy.execute(sql); |
| |
| # Store the model learned |
| plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')'); |
| |
| return '''Finished support vector novelty detection learning on %s table. |
| ''' % (input_table) |
| |
| |
| |
| # --------------------------------------------------- |
| # Function to predict the labels of points in a table |
| # --------------------------------------------------- |
| def sv_predict( input_table, col_name, modelname, output_table, parallel): |
| """ |
| Scores the data points stored in a table using a learned support vector model. |
| |
| @param input_table Name of table/view containing the data points to be scored |
| @param col_name Name of column in input_table containing the data points |
| @param model_name Name under which we want to store the learned model |
| @param output_table Name of table to store the results |
| @param parallel A flag indicating whether the system should learn multiple models in parallel. |
| |
| """ |
| |
| plpy.execute('drop table if exists ' + output_table); |
| plpy.execute('create table ' + output_table + ' ( id int, prediction float8 )'); |
| |
| if (parallel) : |
| num_models_t = plpy.execute('SELECT COUNT(DISTINCT(id)) n FROM MADLIB_SCHEMA.sv_model WHERE position(\'' + modelname + '\' in id) > 0 AND \'' + modelname + '\' <> id;'); |
| num_models = num_models_t[0]['n']; |
| |
| sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) / ' + str(num_models) + ' from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where position(\'' + modelname + '\' in m.id) > 0 AND \'' + modelname + '\' <> m.id group by 1)'; |
| plpy.execute(sql); |
| |
| else : |
| sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where m.id = \'' + modelname + '\' group by 1)'; |
| plpy.execute(sql); |
| |
| return '''Finished processing data points in %s table; results are stored in %s table. |
| ''' % (input_table,output_table) |
| |