methods/kernel-machines/src/pg_gp/online_sv.py_in - madlib - Git at Google

 #!/usr/bin/env python

 import plpy

 # -----------------------------------------------
 # Function to run the regression algorithm
 # -----------------------------------------------
 def sv_regression( input_table, modelname, parallel):
     """
     Executes the support vector regression algorithm.

     @param input_table Name of table/view containing the training data
     @param model_name Name under which we want to store the learned model
     @param parallel A flag indicating whether the system should learn multiple models in parallel.

     """

     if (parallel) :
         # Learning multiple models in parallel

         # Output error if models with the same modelname already exist
         sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
         seen = plpy.execute(sql);
         if (seen[0]['count'] > 0):
             plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

         # Start learning process
         sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ' group by gp_segment_id)';
         plpy.execute(sql);

         # Store the models learned
         numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
         numproc = numproc_t[0]['count'];
         plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
     else :
         # Learning a single model

         # Output error if a model with the same modelname already exists
         sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
         seen = plpy.execute(sql);
         if (seen[0]['count'] > 0):
             plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

         # Start learning process
         sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ')';
         plpy.execute(sql);

         # Store the model learned
         plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');

     return '''Finished support vector regression learning on %s table.
            ''' % (input_table)

 # -----------------------------------------------
 # Function to run the classification algorithm
 # -----------------------------------------------
 def sv_classification( input_table, modelname, parallel):
     """
     Executes the support vector classification algorithm.

     @param input_table Name of table/view containing the training data
     @param model_name Name under which we want to store the learned model
     @param parallel A flag indicating whether the system should learn multiple models in parallel.

     """

     if (parallel) :
         # Learning multiple models in parallel

         # Output error if models with the same modelname already exist
         sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
         seen = plpy.execute(sql);
         if (seen[0]['count'] > 0):
             plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

         # Start learning process
         sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ' group by gp_segment_id)';
         plpy.execute(sql);

         # Store the models learned
         numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
         numproc = numproc_t[0]['count'];
         plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
     else :
         # Learning a single model

         # Output error if a model with the same modelname already exists
         sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
         seen = plpy.execute(sql);
         if (seen[0]['count'] > 0):
             plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

         # Start learning process
         sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ')';
         plpy.execute(sql);

         # Store the model learned
         plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');

     return '''Finished support vector classification learning on %s table.
            ''' % (input_table)


 # -----------------------------------------------
 # Function to run the novelty detection algorithm
 # -----------------------------------------------
 def sv_novelty_detection( input_table, modelname, parallel):
     """
     Executes the support vector novelty detection algorithm.

     @param input_table Name of table/view containing the training data
     @param model_name Name under which we want to store the learned model
     @param parallel A flag indicating whether the system should learn multiple models in parallel.

     """

     if (parallel) :
         # Learning multiple models in parallel

         # Output error if models with the same modelname already exist
         sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
         seen = plpy.execute(sql);
         if (seen[0]['count'] > 0):
             plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

         # Start learning process
         sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' || gp_segment_id, MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ' group by gp_segment_id)';
         plpy.execute(sql);

         # Store the models learned
         numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
         numproc = numproc_t[0]['count'];
         plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
     else :
         # Learning a single model

         # Output error if a model with the same modelname already exists
         sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
         seen = plpy.execute(sql);
         if (seen[0]['count'] > 0):
             plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

         # Start learning process
         sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ')';
         plpy.execute(sql);

         # Store the model learned
         plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');

     return '''Finished support vector novelty detection learning on %s table.
            ''' % (input_table)


 # ---------------------------------------------------
 # Function to predict the labels of points in a table
 # ---------------------------------------------------
 def sv_predict( input_table, col_name, modelname, output_table, parallel):
     """
     Scores the data points stored in a table using a learned support vector model.

     @param input_table Name of table/view containing the data points to be scored
     @param col_name Name of column in input_table containing the data points
     @param model_name Name under which we want to store the learned model
     @param output_table Name of table to store the results
     @param parallel A flag indicating whether the system should learn multiple models in parallel.

     """

     plpy.execute('drop table if exists ' + output_table);
     plpy.execute('create table ' + output_table + ' ( id int, prediction float8 )');

     if (parallel) :
         num_models_t = plpy.execute('SELECT COUNT(DISTINCT(id)) n FROM MADLIB_SCHEMA.sv_model WHERE position(\'' + modelname + '\' in id) > 0 AND \'' + modelname + '\' <> id;');
         num_models = num_models_t[0]['n'];

         sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) / ' + str(num_models) + ' from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where position(\'' + modelname + '\' in m.id) > 0 AND \'' + modelname + '\' <> m.id group by 1)';
         plpy.execute(sql);

     else :
         sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where m.id = \'' + modelname + '\' group by 1)';
         plpy.execute(sql);

     return '''Finished processing data points in %s table; results are stored in %s table.
            ''' % (input_table,output_table)
	#!/usr/bin/env python

	import plpy

	# -----------------------------------------------
	# Function to run the regression algorithm
	# -----------------------------------------------
	def sv_regression( input_table, modelname, parallel):
	"""
	Executes the support vector regression algorithm.

	@param input_table Name of table/view containing the training data
	@param model_name Name under which we want to store the learned model
	@param parallel A flag indicating whether the system should learn multiple models in parallel.

	"""

	if (parallel) :
	# Learning multiple models in parallel

	# Output error if models with the same modelname already exist
	sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
	seen = plpy.execute(sql);
	if (seen[0]['count'] > 0):
	plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

	# Start learning process
	sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' \|\| gp_segment_id, MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ' group by gp_segment_id)';
	plpy.execute(sql);

	# Store the models learned
	numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
	numproc = numproc_t[0]['count'];
	plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
	else :
	# Learning a single model

	# Output error if a model with the same modelname already exists
	sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
	seen = plpy.execute(sql);
	if (seen[0]['count'] > 0):
	plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

	# Start learning process
	sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_reg_agg(ind, label) from ' + input_table + ')';
	plpy.execute(sql);

	# Store the model learned
	plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');

	return '''Finished support vector regression learning on %s table.
	''' % (input_table)

	# -----------------------------------------------
	# Function to run the classification algorithm
	# -----------------------------------------------
	def sv_classification( input_table, modelname, parallel):
	"""
	Executes the support vector classification algorithm.

	@param input_table Name of table/view containing the training data
	@param model_name Name under which we want to store the learned model
	@param parallel A flag indicating whether the system should learn multiple models in parallel.

	"""

	if (parallel) :
	# Learning multiple models in parallel

	# Output error if models with the same modelname already exist
	sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
	seen = plpy.execute(sql);
	if (seen[0]['count'] > 0):
	plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

	# Start learning process
	sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' \|\| gp_segment_id, MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ' group by gp_segment_id)';
	plpy.execute(sql);

	# Store the models learned
	numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
	numproc = numproc_t[0]['count'];
	plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
	else :
	# Learning a single model

	# Output error if a model with the same modelname already exists
	sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
	seen = plpy.execute(sql);
	if (seen[0]['count'] > 0):
	plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

	# Start learning process
	sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_cl_agg(ind, label) from ' + input_table + ')';
	plpy.execute(sql);

	# Store the model learned
	plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');

	return '''Finished support vector classification learning on %s table.
	''' % (input_table)


	# -----------------------------------------------
	# Function to run the novelty detection algorithm
	# -----------------------------------------------
	def sv_novelty_detection( input_table, modelname, parallel):
	"""
	Executes the support vector novelty detection algorithm.

	@param input_table Name of table/view containing the training data
	@param model_name Name under which we want to store the learned model
	@param parallel A flag indicating whether the system should learn multiple models in parallel.

	"""

	if (parallel) :
	# Learning multiple models in parallel

	# Output error if models with the same modelname already exist
	sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '0\'';
	seen = plpy.execute(sql);
	if (seen[0]['count'] > 0):
	plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

	# Start learning process
	sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\' \|\| gp_segment_id, MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ' group by gp_segment_id)';
	plpy.execute(sql);

	# Store the models learned
	numproc_t = plpy.execute('select count(distinct(gp_segment_id)) from ' + input_table);
	numproc = numproc_t[0]['count'];
	plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\', ' + str(numproc) + ')');
	else :
	# Learning a single model

	# Output error if a model with the same modelname already exists
	sql = 'select count(*) from MADLIB_SCHEMA.sv_results where id = \'' + modelname + '\'';
	seen = plpy.execute(sql);
	if (seen[0]['count'] > 0):
	plpy.error('model with name \'' + modelname + '\' already exists; please use a different model name or drop the model using drop_sv_model() function');

	# Start learning process
	sql = 'insert into MADLIB_SCHEMA.sv_results (select \'' + modelname + '\', MADLIB_SCHEMA.online_sv_nd_agg(ind) from ' + input_table + ')';
	plpy.execute(sql);

	# Store the model learned
	plpy.execute('select MADLIB_SCHEMA.storeModel(\'' + modelname + '\')');

	return '''Finished support vector novelty detection learning on %s table.
	''' % (input_table)



	# ---------------------------------------------------
	# Function to predict the labels of points in a table
	# ---------------------------------------------------
	def sv_predict( input_table, col_name, modelname, output_table, parallel):
	"""
	Scores the data points stored in a table using a learned support vector model.

	@param input_table Name of table/view containing the data points to be scored
	@param col_name Name of column in input_table containing the data points
	@param model_name Name under which we want to store the learned model
	@param output_table Name of table to store the results
	@param parallel A flag indicating whether the system should learn multiple models in parallel.

	"""

	plpy.execute('drop table if exists ' + output_table);
	plpy.execute('create table ' + output_table + ' ( id int, prediction float8 )');

	if (parallel) :
	num_models_t = plpy.execute('SELECT COUNT(DISTINCT(id)) n FROM MADLIB_SCHEMA.sv_model WHERE position(\'' + modelname + '\' in id) > 0 AND \'' + modelname + '\' <> id;');
	num_models = num_models_t[0]['n'];

	sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) / ' + str(num_models) + ' from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where position(\'' + modelname + '\' in m.id) > 0 AND \'' + modelname + '\' <> m.id group by 1)';
	plpy.execute(sql);

	else :
	sql = 'insert into ' + output_table + '(select t.id, sum(weight * MADLIB_SCHEMA.kernel(m.sv, t.' + col_name + ')) from MADLIB_SCHEMA.sv_model m, ' + input_table + ' t where m.id = \'' + modelname + '\' group by 1)';
	plpy.execute(sql);

	return '''Finished processing data points in %s table; results are stored in %s table.
	''' % (input_table,output_table)