/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file madlib_keras_gpu_info.sql_in
*
* @brief SQL functions for GPU configuration.
* @date Nov 2019
*
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_gpu_configuration
@brief Utility function to report number and type of GPUs in the database cluster.
\warning <em> This MADlib method is still in early stage development.
Interface and implementation are subject to change. </em>
<div class="toc"><b>Contents</b><ul>
<li class="level1"><a href="#get_gpu_config">GPU Configuration</a></li>
<li class="level1"><a href="#example">Examples</a></li>
<li class="level1"><a href="#references">References</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul></div>
This utility function reports the number and type of GPUs
attached to hosts on the database cluster.
This can be useful when determining which segments to use for
training deep neural nets. For example, for economic reasons
you may wish to set up a heterogeneous cluster with GPUs only
on some of the hosts, not all of them. This utility
can help you identify where the GPUs are and direct the compute
to those locations only in subsequent training and evaluation steps.
@anchor get_gpu_config
@par GPU Configuration
<pre class="syntax">
gpu_configuration( output_table,
                   source
                 )
</pre>
\b Arguments
<dl class="arglist">
<dt>output_table</dt>
<dd>TEXT. Name of the output table in which to write the GPU information.
</dd>
<dt>source (optional)</dt>
<dd>TEXT, default: 'tensorflow'. Source for determining GPU configuration.
Using 'tensorflow' returns a description based on what TensorFlow reports.
Using 'nvidia' returns a description based on what the Nvidia Systems
Management Interface (nvidia-smi) reports [1]. Note that MADlib and Keras use the
TensorFlow information; the lower-level nvidia-smi info is provided for convenience.
@note
If the command 'nvidia-smi -L' returns an error, the error message is not passed
through; instead, no GPUs are shown for that host in the output table.
You may want to run nvidia-smi from the command line (see the example
after this list) to see error and informational messages.
</dd>
</dl>
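If you suspect a problem on a particular host, you can run the underlying
command directly on that host to see the error and informational messages
that this function suppresses:
<pre class="example">
nvidia-smi -L
</pre>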
<b>Output</b>
<br>
The output table contains the following:
<table class="output">
<tr>
<th>hostname</th>
<td>TEXT. Name of the host machine in the cluster.
Does not include master or mirrors. For PostgreSQL
this will always return 'localhost'.
</td>
</tr>
<tr>
<th>gpu_descr</th>
<td>TEXT. String reported by TensorFlow or nvidia-smi.
The two formats differ, as shown in the examples below.
</td>
</tr>
</table>
<br>
@anchor example
@par Examples
-# Get the GPU configuration as reported by TensorFlow:
<pre class="example">
DROP TABLE IF EXISTS host_gpu_mapping_tf;
SELECT * FROM madlib.gpu_configuration('host_gpu_mapping_tf');
SELECT * FROM host_gpu_mapping_tf ORDER BY hostname, gpu_descr;
</pre>
<pre class="result">
hostname | gpu_descr
----------+------------------------------------------------------------------------------------------
phoenix0 | device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
phoenix0 | device: 1, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:05.0, compute capability: 6.0
phoenix0 | device: 2, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:06.0, compute capability: 6.0
phoenix0 | device: 3, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:07.0, compute capability: 6.0
phoenix1 | device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
phoenix1 | device: 1, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:05.0, compute capability: 6.0
phoenix3 | device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
phoenix3 | device: 1, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:05.0, compute capability: 6.0
phoenix3 | device: 2, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:06.0, compute capability: 6.0
phoenix3 | device: 3, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:07.0, compute capability: 6.0
phoenix4 | device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
phoenix4 | device: 1, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:05.0, compute capability: 6.0
phoenix4 | device: 2, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:06.0, compute capability: 6.0
phoenix4 | device: 3, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:07.0, compute capability: 6.0
(14 rows)
</pre>
In this heterogeneous cluster, 4 GPUs are attached to each of hosts phoenix0,
phoenix3, and phoenix4, 2 GPUs are attached to host phoenix1, and no GPUs are
attached to host phoenix2.
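To summarize this output, count the GPUs per host with an ordinary aggregate;
hosts without GPUs (here phoenix2) simply do not appear:
<pre class="example">
SELECT hostname, COUNT(*) AS gpu_count
FROM host_gpu_mapping_tf
GROUP BY hostname
ORDER BY hostname;
</pre>
<pre class="result">
 hostname | gpu_count
----------+-----------
 phoenix0 |         4
 phoenix1 |         2
 phoenix3 |         4
 phoenix4 |         4
(4 rows)
</pre>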
-# Get the GPU configuration as reported by nvidia-smi:
<pre class="example">
DROP TABLE IF EXISTS host_gpu_mapping_nvidia;
SELECT * FROM madlib.gpu_configuration('host_gpu_mapping_nvidia',  -- output table
                                       'nvidia'                    -- source for GPU info
                                      );
SELECT * FROM host_gpu_mapping_nvidia ORDER BY hostname, gpu_descr;
</pre>
<pre class="result">
hostname | gpu_descr
----------+------------------------------------------------------------------------------
phoenix0 | GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-f2ccc77e-2501-f6ee-4754-069dda256fb2)
phoenix0 | GPU 1: Tesla P100-PCIE-16GB (UUID: GPU-b1fc40ca-c7c6-bc86-f20f-6e9a62cda3f8)
phoenix0 | GPU 2: Tesla P100-PCIE-16GB (UUID: GPU-d93bb21b-96f9-7c1d-3bab-cdd92b7bbc9d)
phoenix0 | GPU 3: Tesla P100-PCIE-16GB (UUID: GPU-2d79c4a8-479e-2f33-39f8-3ba80b63f830)
phoenix1 | GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-0af6bb1e-5b5b-4988-ad3a-a917e9584702)
phoenix1 | GPU 1: Tesla P100-PCIE-16GB (UUID: GPU-d824c976-a8aa-ef26-a13c-9a9a7fe86bfd)
phoenix3 | GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-3681d0b6-1ec6-0453-fd81-29d88e549cd9)
phoenix3 | GPU 1: Tesla P100-PCIE-16GB (UUID: GPU-d4b1f2e7-b238-ac9a-bbfe-918adeb69472)
phoenix3 | GPU 2: Tesla P100-PCIE-16GB (UUID: GPU-42a32ef1-a60c-e599-c8cf-0e669111ab6f)
phoenix3 | GPU 3: Tesla P100-PCIE-16GB (UUID: GPU-1cce09c4-6856-8031-be0b-8e8bbf9a10f3)
phoenix4 | GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-a71bdc18-fdd5-ba25-617e-19b23cc8e827)
phoenix4 | GPU 1: Tesla P100-PCIE-16GB (UUID: GPU-f9d13688-7fe6-a029-24d1-985a5659f18f)
phoenix4 | GPU 2: Tesla P100-PCIE-16GB (UUID: GPU-06a7f54b-c07a-e87a-20d6-09bd99b19531)
phoenix4 | GPU 3: Tesla P100-PCIE-16GB (UUID: GPU-af3b32f3-8bd9-cb75-a8fb-25253b9da926)
(14 rows)
</pre>
-# To get a fuller picture at the segment level, combine the output with the
Greenplum catalog table 'gp_segment_configuration', which contains information
about segment instance configuration [2]. Here is an example of this table,
filtering out the master and mirrors:
<pre class="example">
SELECT * FROM gp_segment_configuration WHERE role='p' AND content>=0 ORDER BY hostname, dbid;
</pre>
<pre class="result">
dbid | content | role | preferred_role | mode | status | port | hostname | address | replication_port
------+---------+------+----------------+------+--------+-------+----------+----------+------------------
2 | 0 | p | p | c | u | 40000 | phoenix0 | phoenix0 | 70000
3 | 1 | p | p | c | u | 40001 | phoenix0 | phoenix0 | 70001
4 | 2 | p | p | c | u | 40002 | phoenix0 | phoenix0 | 70002
5 | 3 | p | p | c | u | 40003 | phoenix0 | phoenix0 | 70003
6 | 4 | p | p | c | u | 40000 | phoenix1 | phoenix1 | 70000
7 | 5 | p | p | c | u | 40001 | phoenix1 | phoenix1 | 70001
8 | 6 | p | p | c | u | 40002 | phoenix1 | phoenix1 | 70002
9 | 7 | p | p | c | u | 40003 | phoenix1 | phoenix1 | 70003
10 | 8 | p | p | c | u | 40000 | phoenix2 | phoenix2 | 70000
11 | 9 | p | p | c | u | 40001 | phoenix2 | phoenix2 | 70001
12 | 10 | p | p | c | u | 40002 | phoenix2 | phoenix2 | 70002
13 | 11 | p | p | c | u | 40003 | phoenix2 | phoenix2 | 70003
14 | 12 | p | p | c | u | 40000 | phoenix3 | phoenix3 | 70000
15 | 13 | p | p | c | u | 40001 | phoenix3 | phoenix3 | 70001
16 | 14 | p | p | c | u | 40002 | phoenix3 | phoenix3 | 70002
17 | 15 | p | p | c | u | 40003 | phoenix3 | phoenix3 | 70003
18 | 16 | p | p | c | u | 40000 | phoenix4 | phoenix4 | 70000
19 | 17 | p | p | c | u | 40001 | phoenix4 | phoenix4 | 70001
20 | 18 | p | p | c | u | 40002 | phoenix4 | phoenix4 | 70002
21 | 19 | p | p | c | u | 40003 | phoenix4 | phoenix4 | 70003
(20 rows)
</pre>
Now join this table with the GPU resources table to create a table containing a
list of all segments on hosts with GPUs attached:
<pre class="example">
DROP TABLE IF EXISTS segments_to_use;
CREATE TABLE segments_to_use AS
SELECT DISTINCT dbid, hostname
FROM gp_segment_configuration
JOIN host_gpu_mapping_tf USING (hostname)
WHERE role='p' AND content>=0;
SELECT * FROM segments_to_use ORDER BY hostname, dbid;
</pre>
<pre class="result">
dbid | hostname
------+----------
2 | phoenix0
3 | phoenix0
4 | phoenix0
5 | phoenix0
6 | phoenix1
7 | phoenix1
8 | phoenix1
9 | phoenix1
14 | phoenix3
15 | phoenix3
16 | phoenix3
17 | phoenix3
18 | phoenix4
19 | phoenix4
20 | phoenix4
21 | phoenix4
(16 rows)
</pre>
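You can then use this table to direct the compute to the GPU hosts only.
As a minimal sketch, assuming a training table 'iris_train' with columns
'attributes' and 'class_text' (hypothetical names), and a MADlib version
in which the deep learning data preprocessor supports the 'distribution_rules'
parameter, pass the table name as that parameter so the packed training data
lands only on segments of GPU hosts:
<pre class="example">
DROP TABLE IF EXISTS iris_train_packed, iris_train_packed_summary;
SELECT madlib.training_preprocessor_dl('iris_train',         -- source table (assumed to exist)
                                       'iris_train_packed',  -- output table
                                       'class_text',         -- dependent variable
                                       'attributes',         -- independent variable
                                       NULL,                 -- buffer size
                                       1.0,                  -- normalizing constant
                                       NULL,                 -- number of classes
                                       'segments_to_use'     -- distribution rules
                                      );
</pre>
Subsequent training and evaluation on 'iris_train_packed' then use only
those segments.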
@anchor references
@par References
[1] Nvidia Systems Management Interface (nvidia-smi) https://developer.nvidia.com/nvidia-system-management-interface
[2] Greenplum 'gp_segment_configuration' table https://gpdb.docs.pivotal.io/latest/ref_guide/system_catalogs/gp_segment_configuration.html
@anchor related
@par Related Topics
See madlib_keras_gpu_info.sql_in
*/
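-- Helper function: returns the GPU descriptions reported by TensorFlow
-- on the host where it executes.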
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gpu_info_tensorflow() RETURNS TEXT[] AS
$$
    PythonFunctionBodyOnlyNoSchema(`deep_learning', `madlib_keras_gpu_info')
    return madlib_keras_gpu_info.GPUInfoFunctions.get_gpu_info_from_tensorflow()
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
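-- Helper function: returns the GPU descriptions reported by nvidia-smi
-- on the host where it executes.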
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gpu_info_nvidia() RETURNS TEXT[] AS
$$
    PythonFunctionBodyOnlyNoSchema(`deep_learning', `madlib_keras_gpu_info')
    return madlib_keras_gpu_info.GPUInfoFunctions.get_gpu_info_from_nvidia()
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
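-- Main entry point: writes one row per (hostname, gpu_descr) to output_table,
-- using 'tensorflow' or 'nvidia' as the source of GPU information.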
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gpu_configuration(output_table text, source text)
RETURNS VOID AS
$$
    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_gpu_info')
    from utilities.control import AOControl, MinWarning
    # Enter both context managers; chaining them with 'and' would evaluate
    # the boolean expression and enter only one of them.
    with AOControl(False), MinWarning("error"):
        madlib_keras_gpu_info.gpu_configuration(schema_madlib, output_table, source)
$$
LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
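-- Overload with the source argument omitted; defaults to 'tensorflow'
-- (see the argument description above).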
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gpu_configuration(output_table text)
RETURNS VOID AS
$$
    SELECT MADLIB_SCHEMA.gpu_configuration($1, NULL);
$$
LANGUAGE sql;
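-- Help function: returns the usage message when gpu_configuration()
-- is called with no arguments.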
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.gpu_configuration()
RETURNS VARCHAR AS $$
    PythonFunctionBodyOnly(`deep_learning', `madlib_keras_gpu_info')
    return madlib_keras_gpu_info.gpu_configuration_help(schema_madlib)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');