src/ports/postgres/modules/kmeans/kmeans_auto.py_in - madlib - Git at Google

 # coding=utf-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
 @file kmeans_auto.py_in

 @brief

 """

 import numpy as np
 import plpy
 import math
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import unique_string
 from utilities.validate_args import output_tbl_valid
 from utilities.validate_args import get_algorithm_name

 ELBOW = 'elbow'
 SILHOUETTE = 'silhouette'
 BOTH = 'both'

 RANDOM = 'random'
 PP = 'pp'

 def _validate(output_table, k):

     output_tbl_valid(output_table, "kmeans_auto")
     output_tbl_valid('{0}_summary'.format(output_table), "kmeans_auto")

     _assert(k, "kmeans_auto: k cannot be NULL.")
     _assert(len(k)>1, "kmeans_auto: Length of k array should be more than 1.")
     _assert(min(k)>1, "kmeans_auto: the minimum k value has to be > 1.")
     _assert(len(set(k)) == len(k), "kmeans_auto: Duplicate values are not allowed in k.")


 def set_defaults(schema_madlib, fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned, k_selection_algorithm, seeding, seeding_sample_ratio):

     fn_dist = (fn_dist if fn_dist else '{0}.squared_dist_norm2'.format(schema_madlib))
     agg_centroid = agg_centroid if agg_centroid \
                     else '{0}.avg'.format(schema_madlib)
     max_num_iterations = max_num_iterations if max_num_iterations \
                           else 20
     min_frac_reassigned = min_frac_reassigned if min_frac_reassigned \
                            else 0.001

     k_selection_algorithm = get_algorithm_name(k_selection_algorithm, SILHOUETTE,
         [ELBOW, SILHOUETTE, BOTH], 'kmeans_auto')

     if seeding is PP:
         seeding_sample_ratio = (seeding_sample_ratio
                                 if seeding_sample_ratio is not None else 1.0)
     return (fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
             k_selection_algorithm, seeding_sample_ratio)

 def kmeans_auto(schema_madlib, rel_source, output_table, expr_point, k,
     fn_dist=None, agg_centroid=None, max_num_iterations=None,
     min_frac_reassigned=None, k_selection_algorithm=None, seeding=None,
     seeding_sample_ratio=None, **kwargs):

     with MinWarning("error"):
         _validate(output_table, k)

         (fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
          k_selection_algorithm, seeding_sample_ratio) = set_defaults(
             schema_madlib, fn_dist, agg_centroid, max_num_iterations,
             min_frac_reassigned, k_selection_algorithm, seeding,
             seeding_sample_ratio)

         silhouette_col = ""
         elbow_col = ""

         # If the selection is elbow or both, calculate elbow
         use_silhouette = k_selection_algorithm in [SILHOUETTE, BOTH]
         # If the selection is silhouette or both, calculate silhouette
         use_elbow = k_selection_algorithm in [ELBOW, BOTH]

         if use_silhouette:
             silhouette_col = ", {0} DOUBLE PRECISION".format(SILHOUETTE)
         if use_elbow:
             elbow_col = ", {0} DOUBLE PRECISION".format(ELBOW)

         plpy.execute("""
             CREATE TABLE {output_table} (
                 k INTEGER,
                 centroids   DOUBLE PRECISION[][],
                 cluster_variance    DOUBLE PRECISION[],
                 objective_fn    DOUBLE PRECISION,
                 frac_reassigned DOUBLE PRECISION,
                 num_iterations  INTEGER
                 {silhouette_col}
                 {elbow_col})
             """.format(**locals()))

         silhouette_vals = []

         for current_k in k:
             if seeding is 'random':
                 plpy.execute("""
                     INSERT INTO {output_table}
                     (k, centroids, cluster_variance, objective_fn, frac_reassigned,
                     num_iterations)
                     SELECT {current_k} as k, *
                     FROM {schema_madlib}.kmeans_random('{rel_source}',
                                          '{expr_point}',
                                          {current_k},
                                          '{fn_dist}',
                                          '{agg_centroid}',
                                          {max_num_iterations},
                                          {min_frac_reassigned});
                     """.format(**locals()))
             else:
                 plpy.execute("""
                     INSERT INTO {output_table}
                     (k, centroids, cluster_variance, objective_fn, frac_reassigned,
                     num_iterations)
                     SELECT {current_k} as k, *
                     FROM {schema_madlib}.kmeanspp('{rel_source}',
                                          '{expr_point}',
                                          {current_k},
                                          '{fn_dist}',
                                          '{agg_centroid}',
                                          {max_num_iterations},
                                          {min_frac_reassigned},
                                          {seeding_sample_ratio});
                     """.format(**locals()))

             if use_silhouette:
                 silhouette_query= """
                     SELECT * FROM {schema_madlib}.simple_silhouette(
                         '{rel_source}',
                         '{expr_point}',
                         (SELECT centroids
                          FROM {output_table}
                          WHERE k = {current_k}),
                         '{fn_dist}')
                     """.format(**locals())
                 new_silh = plpy.execute(silhouette_query)[0]['simple_silhouette']
                 if math.isinf(new_silh):
                     new_silh = "Infinity"
                 elif math.isnan(new_silh):
                     new_silh = "NaN"
                 silhouette_vals.append(new_silh)


         update_query = """
             UPDATE {output_table} SET {{column}} = __value__ FROM
             (SELECT unnest(ARRAY[{k_arr}]) AS __k__,
                     unnest(ARRAY[{{calc_arr}}]::DOUBLE PRECISION[]) AS __value__
             )sub_q
             WHERE __k__ = k
             """.format(output_table = output_table,
                        k_arr = str(k)[1:-1])
         if use_silhouette:
             optimal_sil =  k[np.argmax(np.array(silhouette_vals))]
             plpy.execute(update_query.format(column = SILHOUETTE,
                 calc_arr = str(silhouette_vals)[1:-1]))

         if use_elbow:
             optimal_elbow, second_order = _calculate_elbow(output_table)
             plpy.execute(update_query.format(column = ELBOW,
                 calc_arr = str(second_order)[1:-1]))

         optimal_k = optimal_sil if use_silhouette else optimal_elbow

         plpy.execute("""
             CREATE TABLE {output_table}_summary AS
             SELECT {output_table}.*,
                    '{algorithm}'::VARCHAR AS selection_algorithm
             FROM {output_table}
             WHERE k = {optimal_k}
             """.format(algorithm = SILHOUETTE if use_silhouette else ELBOW,
                        **locals()))

     return

 def _calculate_elbow(output_table):

     # We have to get the values in ordered fashion because the elbow is only defined for ordered values.
     inertia_result = plpy.execute("""
                  SELECT k, objective_fn FROM {output_table} ORDER BY k ASC
                  """.format(**locals()))
     k = [ i['k'] for i in inertia_result ]
     inertia_list = [ i['objective_fn'] for i in inertia_result ]
     inertia_list = np.array(inertia_list)

     first_order=np.gradient(inertia_list, k)
     second_order=np.gradient(first_order, k)
     index_with_elbow=k[np.argmax(second_order)]

     return index_with_elbow, second_order.tolist()

 def kmeans_random_auto(schema_madlib, rel_source, output_table, expr_point, k,
     fn_dist=None, agg_centroid=None, max_num_iterations=None,
     min_frac_reassigned=None, k_selection_algorithm=None, **kwargs):

     kmeans_auto(schema_madlib, rel_source, output_table, expr_point, k,
     fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
     k_selection_algorithm, RANDOM)

     return

 def kmeanspp_auto(schema_madlib, rel_source, output_table, expr_point, k,
     fn_dist=None, agg_centroid=None, max_num_iterations=None,
     min_frac_reassigned=None, seeding_sample_ratio=None,
     k_selection_algorithm=None, **kwargs):

     kmeans_auto(schema_madlib, rel_source, output_table, expr_point, k,
     fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
     k_selection_algorithm, PP, seeding_sample_ratio)

     return
	# coding=utf-8
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""
	@file kmeans_auto.py_in

	@brief

	"""

	import numpy as np
	import plpy
	import math
	from utilities.control import MinWarning
	from utilities.utilities import _assert
	from utilities.utilities import unique_string
	from utilities.validate_args import output_tbl_valid
	from utilities.validate_args import get_algorithm_name

	ELBOW = 'elbow'
	SILHOUETTE = 'silhouette'
	BOTH = 'both'

	RANDOM = 'random'
	PP = 'pp'

	def _validate(output_table, k):

	output_tbl_valid(output_table, "kmeans_auto")
	output_tbl_valid('{0}_summary'.format(output_table), "kmeans_auto")

	_assert(k, "kmeans_auto: k cannot be NULL.")
	_assert(len(k)>1, "kmeans_auto: Length of k array should be more than 1.")
	_assert(min(k)>1, "kmeans_auto: the minimum k value has to be > 1.")
	_assert(len(set(k)) == len(k), "kmeans_auto: Duplicate values are not allowed in k.")


	def set_defaults(schema_madlib, fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned, k_selection_algorithm, seeding, seeding_sample_ratio):

	fn_dist = (fn_dist if fn_dist else '{0}.squared_dist_norm2'.format(schema_madlib))
	agg_centroid = agg_centroid if agg_centroid \
	else '{0}.avg'.format(schema_madlib)
	max_num_iterations = max_num_iterations if max_num_iterations \
	else 20
	min_frac_reassigned = min_frac_reassigned if min_frac_reassigned \
	else 0.001

	k_selection_algorithm = get_algorithm_name(k_selection_algorithm, SILHOUETTE,
	[ELBOW, SILHOUETTE, BOTH], 'kmeans_auto')

	if seeding is PP:
	seeding_sample_ratio = (seeding_sample_ratio
	if seeding_sample_ratio is not None else 1.0)
	return (fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
	k_selection_algorithm, seeding_sample_ratio)

	def kmeans_auto(schema_madlib, rel_source, output_table, expr_point, k,
	fn_dist=None, agg_centroid=None, max_num_iterations=None,
	min_frac_reassigned=None, k_selection_algorithm=None, seeding=None,
	seeding_sample_ratio=None, **kwargs):

	with MinWarning("error"):
	_validate(output_table, k)

	(fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
	k_selection_algorithm, seeding_sample_ratio) = set_defaults(
	schema_madlib, fn_dist, agg_centroid, max_num_iterations,
	min_frac_reassigned, k_selection_algorithm, seeding,
	seeding_sample_ratio)

	silhouette_col = ""
	elbow_col = ""

	# If the selection is elbow or both, calculate elbow
	use_silhouette = k_selection_algorithm in [SILHOUETTE, BOTH]
	# If the selection is silhouette or both, calculate silhouette
	use_elbow = k_selection_algorithm in [ELBOW, BOTH]

	if use_silhouette:
	silhouette_col = ", {0} DOUBLE PRECISION".format(SILHOUETTE)
	if use_elbow:
	elbow_col = ", {0} DOUBLE PRECISION".format(ELBOW)

	plpy.execute("""
	CREATE TABLE {output_table} (
	k INTEGER,
	centroids DOUBLE PRECISION[][],
	cluster_variance DOUBLE PRECISION[],
	objective_fn DOUBLE PRECISION,
	frac_reassigned DOUBLE PRECISION,
	num_iterations INTEGER
	{silhouette_col}
	{elbow_col})
	""".format(**locals()))

	silhouette_vals = []

	for current_k in k:
	if seeding is 'random':
	plpy.execute("""
	INSERT INTO {output_table}
	(k, centroids, cluster_variance, objective_fn, frac_reassigned,
	num_iterations)
	SELECT {current_k} as k, *
	FROM {schema_madlib}.kmeans_random('{rel_source}',
	'{expr_point}',
	{current_k},
	'{fn_dist}',
	'{agg_centroid}',
	{max_num_iterations},
	{min_frac_reassigned});
	""".format(**locals()))
	else:
	plpy.execute("""
	INSERT INTO {output_table}
	(k, centroids, cluster_variance, objective_fn, frac_reassigned,
	num_iterations)
	SELECT {current_k} as k, *
	FROM {schema_madlib}.kmeanspp('{rel_source}',
	'{expr_point}',
	{current_k},
	'{fn_dist}',
	'{agg_centroid}',
	{max_num_iterations},
	{min_frac_reassigned},
	{seeding_sample_ratio});
	""".format(**locals()))

	if use_silhouette:
	silhouette_query= """
	SELECT * FROM {schema_madlib}.simple_silhouette(
	'{rel_source}',
	'{expr_point}',
	(SELECT centroids
	FROM {output_table}
	WHERE k = {current_k}),
	'{fn_dist}')
	""".format(**locals())
	new_silh = plpy.execute(silhouette_query)[0]['simple_silhouette']
	if math.isinf(new_silh):
	new_silh = "Infinity"
	elif math.isnan(new_silh):
	new_silh = "NaN"
	silhouette_vals.append(new_silh)


	update_query = """
	UPDATE {output_table} SET {{column}} = __value__ FROM
	(SELECT unnest(ARRAY[{k_arr}]) AS __k__,
	unnest(ARRAY[{{calc_arr}}]::DOUBLE PRECISION[]) AS __value__
	)sub_q
	WHERE __k__ = k
	""".format(output_table = output_table,
	k_arr = str(k)[1:-1])
	if use_silhouette:
	optimal_sil = k[np.argmax(np.array(silhouette_vals))]
	plpy.execute(update_query.format(column = SILHOUETTE,
	calc_arr = str(silhouette_vals)[1:-1]))

	if use_elbow:
	optimal_elbow, second_order = _calculate_elbow(output_table)
	plpy.execute(update_query.format(column = ELBOW,
	calc_arr = str(second_order)[1:-1]))

	optimal_k = optimal_sil if use_silhouette else optimal_elbow

	plpy.execute("""
	CREATE TABLE {output_table}_summary AS
	SELECT {output_table}.*,
	'{algorithm}'::VARCHAR AS selection_algorithm
	FROM {output_table}
	WHERE k = {optimal_k}
	""".format(algorithm = SILHOUETTE if use_silhouette else ELBOW,
	**locals()))

	return

	def _calculate_elbow(output_table):

	# We have to get the values in ordered fashion because the elbow is only defined for ordered values.
	inertia_result = plpy.execute("""
	SELECT k, objective_fn FROM {output_table} ORDER BY k ASC
	""".format(**locals()))
	k = [ i['k'] for i in inertia_result ]
	inertia_list = [ i['objective_fn'] for i in inertia_result ]
	inertia_list = np.array(inertia_list)

	first_order=np.gradient(inertia_list, k)
	second_order=np.gradient(first_order, k)
	index_with_elbow=k[np.argmax(second_order)]

	return index_with_elbow, second_order.tolist()

	def kmeans_random_auto(schema_madlib, rel_source, output_table, expr_point, k,
	fn_dist=None, agg_centroid=None, max_num_iterations=None,
	min_frac_reassigned=None, k_selection_algorithm=None, **kwargs):

	kmeans_auto(schema_madlib, rel_source, output_table, expr_point, k,
	fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
	k_selection_algorithm, RANDOM)

	return

	def kmeanspp_auto(schema_madlib, rel_source, output_table, expr_point, k,
	fn_dist=None, agg_centroid=None, max_num_iterations=None,
	min_frac_reassigned=None, seeding_sample_ratio=None,
	k_selection_algorithm=None, **kwargs):

	kmeans_auto(schema_madlib, rel_source, output_table, expr_point, k,
	fn_dist, agg_centroid, max_num_iterations, min_frac_reassigned,
	k_selection_algorithm, PP, seeding_sample_ratio)

	return