blob: 2778a0c015d3c5b1d6b9e7bcf4afc1a587fdd9e3 [file] [log] [blame]
#!/usr/bin/env python
import plpy
# -----------------------------------------------
# Function to run the regression algorithm
# -----------------------------------------------
def sv_regression( input_table, modelname, parallel):
"""
Executes the support vector regression algorithm.
@param input_table Name of table/view containing the training data
@param model_name Name under which we want to store the learned model
@param parallel A flag indicating whether the system should learn multiple models in parallel.
"""
if (parallel) :
# Learning multiple models in parallel
# Output error if models with the same modelname already exist
sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
seen = plpy.execute(sql);
if (seen[0]['count'] > 0):
plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');
# Start learning process
sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ' group by gp_segment_id)';
plpy.execute(sql);
# Store the models learned
numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
numproc = numproc_t[0]['count'];
plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
else :
# Learning a single model
# Output error if a model with the same modelname already exists
sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
seen = plpy.execute(sql);
if (seen[0]['count'] > 0):
plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');
# Start learning process
sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ')';
plpy.execute(sql);
# Store the model learned
plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');
return '''Finished support vector regression learning on %s table.
''' % (input_table)
# -----------------------------------------------
# Function to run the classification algorithm
# -----------------------------------------------
def sv_classification( input_table, modelname, parallel):
"""
Executes the support vector classification algorithm.
@param input_table Name of table/view containing the training data
@param model_name Name under which we want to store the learned model
@param parallel A flag indicating whether the system should learn multiple models in parallel.
"""
if (parallel) :
# Learning multiple models in parallel
# Output error if models with the same modelname already exist
sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
seen = plpy.execute(sql);
if (seen[0]['count'] > 0):
plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');
# Start learning process
sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ' group by gp_segment_id)';
plpy.execute(sql);
# Store the models learned
numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
numproc = numproc_t[0]['count'];
plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
else :
# Learning a single model
# Output error if a model with the same modelname already exists
sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
seen = plpy.execute(sql);
if (seen[0]['count'] > 0):
plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');
# Start learning process
sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ')';
plpy.execute(sql);
# Store the model learned
plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');
return '''Finished support vector classification learning on %s table.
''' % (input_table)
# -----------------------------------------------
# Function to run the novelty detection algorithm
# -----------------------------------------------
def sv_novelty_detection( input_table, modelname, parallel):
"""
Executes the support vector novelty detection algorithm.
@param input_table Name of table/view containing the training data
@param model_name Name under which we want to store the learned model
@param parallel A flag indicating whether the system should learn multiple models in parallel.
"""
if (parallel) :
# Learning multiple models in parallel
# Output error if models with the same modelname already exist
sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
seen = plpy.execute(sql);
if (seen[0]['count'] > 0):
plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');
# Start learning process
sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ' group by gp_segment_id)';
plpy.execute(sql);
# Store the models learned
numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
numproc = numproc_t[0]['count'];
plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
else :
# Learning a single model
# Output error if a model with the same modelname already exists
sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
seen = plpy.execute(sql);
if (seen[0]['count'] > 0):
plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');
# Start learning process
sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ')';
plpy.execute(sql);
# Store the model learned
plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');
return '''Finished support vector novelty detection learning on %s table.
''' % (input_table)
# ---------------------------------------------------
# Function to predict the labels of points in a table
# ---------------------------------------------------
def sv_predict( input_table, col_name, modelname, output_table, parallel):
"""
Scores the data points stored in a table using a learned support vector model.
@param input_table Name of table/view containing the data points to be scored
@param col_name Name of column in input_table containing the data points
@param model_name Name under which we want to store the learned model
@param output_table Name of table to store the results
@param parallel A flag indicating whether the system should learn multiple models in parallel.
"""
plpy.execute('drop table if exists ' + output_table);
plpy.execute('create table ' + output_table + ' ( id int, prediction float8 )');
if (parallel) :
num_models_t = plpy.execute('SELECT COUNT(DISTINCT(id)) n FROM MADLIB_SCHEMA.sv_model WHERE position(\'' + modelname + '\' in id) > 0 AND \'' + modelname + '\' <> id;');
num_models = num_models_t[0]['n'];
sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) / ' + str(num_models) + ' from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where position(\'' + modelname + '\' in m.id) > 0 AND \'' + modelname + '\' <> m.id group by 1)';
plpy.execute(sql);
else :
sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where m.id = \'' + modelname + '\' group by 1)';
plpy.execute(sql);
return '''Finished processing data points in %s table; results are stored in %s table.
''' % (input_table,output_table)