# blob: 4eec8a7f60d3411e08503b929f5f2e73206fb2a7
# coding=utf-8
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
@file madlib_keras_gpu_info.py_in
@brief GPU configuration helper function
@namespace madlib_keras_gpu_info
"""
import os
import subprocess
import plpy
from utilities.utilities import is_platform_pg
from utilities.utilities import unique_string
from utilities.validate_args import output_tbl_valid
class OutputInfoSchema:
    # Naming constants shared by the GPDB helper functions below.
    # Name of the intermediate per-segment GPU-info table; generated once at
    # module import time via unique_string() to avoid clashing with user tables.
    TEMP_INFO_TABLE = unique_string(desp='gpu_info')
    # Column holding the Greenplum segment id in the intermediate table.
    SEG_ID_COL = 'gp_seg_id'
    # Column holding the GPU description string (from nvidia-smi or TensorFlow).
    GPU_DESCR_COL = 'gpu_descr'
class Source:
    # Allowed values for the 'source' argument of gpu_configuration().
    NVIDIA = 'nvidia'          # probe GPUs with `nvidia-smi -L`
    TENSORFLOW = 'tensorflow'  # probe GPUs via TensorFlow's device list
class GPUInfoFunctions:
    """Static helpers that probe the local host for its GPUs.

    Both helpers are intended to execute on the database segment(s);
    avoid issuing any non-SELECT plpy.execute while they run.
    """

    @staticmethod
    def get_gpu_info_from_nvidia():
        """Return the GPU description lines printed by `nvidia-smi -L`.

        :return: list of description lines, or [] when nvidia-smi is
                 absent or exits with an error.
        """
        try:
            smi_output = subprocess.check_output(["nvidia-smi", "-L"])
        except Exception:  # nvidia-smi missing/failed => report no GPUs
            return []
        return smi_output.splitlines()

    @staticmethod
    def get_gpu_info_from_tensorflow():
        """Return the GPU description lines reported by TensorFlow.

        Runs the companion script gpu_info_from_tf.py from this module's
        directory in a separate python process.
        :return: list of gpu description lines.
        """
        script_dir = os.path.dirname(os.path.realpath(__file__))
        tf_output = subprocess.check_output(["python", "gpu_info_from_tf.py"],
                                            cwd=script_dir)
        return tf_output.splitlines()
def gpu_configuration(schema_madlib, output_table, source):
    """Write `output_table` listing every GPU in the cluster with its host.

    Example output:
        GPDB
        gpu_descr         | hostname
        ------------------+--------------------------
        NVIDIA Tesla P100 | host1
        NVIDIA Tesla P100 | host1
        Super Duper GPU   | host2
        Super Duper GPU   | host2

        POSTGRES
        gpu_descr         | hostname
        ------------------+--------------------------
        NVIDIA Tesla P100 | localhost

    GPDB workflow (see gpu_for_gpdb and its helpers):
    1. Query gp_segment_configuration for the min primary segment id per
       host, so the gpu UDF runs on exactly one segment per host.
    2. Run the tf/nvidia UDF via gp_dist_random on just those segment ids
       and store (segment id, gpu description) in a temp table -- this
       table has no hostnames yet.
    3. Join that temp table back to gp_segment_configuration to attach
       hostnames, filtering out the master, mirrors, and empty/null
       descriptions.
    The three steps are deliberately separate queries: merging them causes
    a redistribution before the UDF runs, so the UDF would not execute on
    the segments we selected.

    :param schema_madlib: schema where the MADlib gpu_info UDFs live
    :param output_table: name of the table to create
    :param source: 'tensorflow' (default when falsy) or 'nvidia'
    """
    module_name = 'madlib_keras_gpu_info'
    output_tbl_valid(output_table, module_name)

    source = (source or Source.TENSORFLOW).lower()
    if source not in (Source.TENSORFLOW, Source.NVIDIA):
        plpy.error("DL: source has to be one of {0} or {1}".format(
            Source.TENSORFLOW, Source.NVIDIA))

    gpu_fn_name = 'gpu_info_{0}'.format(source)
    if is_platform_pg():
        gpu_for_postgres(schema_madlib, output_table, gpu_fn_name)
    else:
        gpu_for_gpdb(schema_madlib, output_table, gpu_fn_name)
def gpu_for_postgres(schema_madlib, output_table, gpu_fn_name):
    """Create `output_table` on PostgreSQL: a single host, reported as
    'localhost', with one row per non-empty GPU description.

    :param schema_madlib: schema where the gpu_info UDF lives
    :param output_table: name of the table to create
    :param gpu_fn_name: 'gpu_info_tensorflow' or 'gpu_info_nvidia'
    """
    descr_col = OutputInfoSchema.GPU_DESCR_COL
    create_query = """
    CREATE TABLE {out_tbl} AS
    SELECT 'localhost' as hostname, {descr} from (SELECT unnest({schema}.{fn}()) AS {descr}) s1
    where {descr} is NOT NULL AND {descr} != ''
    """.format(out_tbl=output_table, descr=descr_col,
               schema=schema_madlib, fn=gpu_fn_name)
    plpy.execute(create_query)
def gpu_for_gpdb(schema_madlib, output_table, gpu_fn_name):
    """Create `output_table` on Greenplum.

    Probes one segment per host with the gpu UDF, joins the results with
    hostnames, and drops the intermediate table when done.
    """
    seg_ids_per_host = get_min_seg_ids_on_each_host()
    create_gpu_info_table_without_hostname(schema_madlib, gpu_fn_name,
                                           seg_ids_per_host)
    create_gpu_info_table_with_hostname(output_table)
    # Intermediate table is no longer needed once hostnames are attached.
    plpy.execute("DROP TABLE IF EXISTS {0}".format(
        OutputInfoSchema.TEMP_INFO_TABLE))
def get_min_seg_ids_on_each_host():
    """Return the smallest primary segment id on each host.

    Running the gpu UDF on just one segment per host is sufficient, so we
    group gp_segment_configuration by hostname and keep the min content id.
    :return: comma-separated string of segment ids, one per host
    """
    alias = 'min_seg_id'
    min_seg_query = """
        SELECT {min_seg_id_alias} FROM
        (select hostname, min(content) AS {min_seg_id_alias}
        FROM gp_segment_configuration WHERE content != -1 AND role='p'
        GROUP BY hostname) min_seg_id_subquery
        """.format(min_seg_id_alias=alias)
    rows = plpy.execute(min_seg_query)
    return ','.join(str(row[alias]) for row in rows)
def create_gpu_info_table_without_hostname(schema_madlib, gpu_fn_name,
                                           min_seg_on_each_host):
    """Run the tf/nvidia UDF on the given segments and store the raw output.

    The result table (OutputInfoSchema.TEMP_INFO_TABLE) contains only the
    segment id and the UDF output -- hostnames are attached later by
    create_gpu_info_table_with_hostname. gp_dist_random('gp_id') forces the
    UDF to execute on the segments themselves rather than the master.

    :param schema_madlib: schema where the gpu_info UDF lives
    :param gpu_fn_name: 'gpu_info_tensorflow' or 'gpu_info_nvidia'
    :param min_seg_on_each_host: comma-separated segment ids, one per host
    """
    per_host_query = """
    CREATE TABLE {temp_tbl} AS SELECT gp_segment_id AS {seg_col}, {schema}.{fn}()
    AS {descr_col} FROM gp_dist_random('gp_id') WHERE gp_segment_id IN ({seg_ids})
    """.format(temp_tbl=OutputInfoSchema.TEMP_INFO_TABLE,
               seg_col=OutputInfoSchema.SEG_ID_COL,
               schema=schema_madlib,
               fn=gpu_fn_name,
               descr_col=OutputInfoSchema.GPU_DESCR_COL,
               seg_ids=min_seg_on_each_host)
    plpy.execute(per_host_query)
def create_gpu_info_table_with_hostname(output_table):
    """Create the final output table of (hostname, gpu description) rows.

    Joins the intermediate table built by
    create_gpu_info_table_without_hostname with gp_segment_configuration
    (primaries only, master excluded) and drops empty/null descriptions.
    """
    descr_col = OutputInfoSchema.GPU_DESCR_COL
    join_query = """
    CREATE TABLE {out_tbl} AS
    SELECT hostname, {descr} FROM
    (
        SELECT hostname, unnest({descr}) AS {descr} FROM {temp_tbl}
        JOIN
        gp_segment_configuration ON {seg_col}=content WHERE content != -1 AND role='p'
    ) s1
    WHERE {descr} != '' AND {descr} is NOT NULL ORDER BY 1,2;
    """.format(out_tbl=output_table,
               descr=descr_col,
               temp_tbl=OutputInfoSchema.TEMP_INFO_TABLE,
               seg_col=OutputInfoSchema.SEG_ID_COL)
    plpy.execute(join_query)
def gpu_configuration_help(schema_madlib):
    """Return the help/usage text for gpu_configuration.

    Fix: the USAGE example previously closed the call twice (a stray ')'
    line immediately before ');'), which showed users a syntactically
    invalid example; it now closes exactly once.

    :param schema_madlib: schema qualification substituted into the example
    :return: str -- help/usage information
    """
    help_string = """
Utility function to report number and type of GPUs on the database cluster.

-----------------------------------------------------------------------
                            USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.gpu_configuration(
    output_table,   -- Name of the output table to write out the
                       GPU information.
    source          -- Default: 'tensorflow'. Source for determining
                       GPU configuration.
                       Using 'tensorflow' returns a description based
                       on what TensorFlow reports.
                       Using 'nvidia' returns a description based
                       on what the Nvidia Systems Management Interface
                       (nvidia-smi) reports [1].
                       Note that MADlib and Keras will use the TensorFlow
                       information; the lower level nvidia-smi info
                       is provided for convenience.
);

-----------------------------------------------------------------------
                            OUTPUT
-----------------------------------------------------------------------
The output table ('output_table' above) contains the following columns:

hostname:   Name of the host machine in the cluster.
            Does not include master or mirrors. For PostgreSQL this will
            always return 'localhost'.
gpu_descr:  String reported by TensorFlow or nvidia-smi.
"""
    return help_string.format(schema_madlib=schema_madlib)