blob: 195759d682a5f650666df3c22fa50ed84a1260c3 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file sssp.sql_in
*
* @brief SQL functions for graph analytics
* @date Nov 2016
*
* @sa Provides single source shortest path algorithm.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_sssp
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#sssp">SSSP</a></li>
<li><a href="#notes">Notes</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Finds the shortest path from a single source vertex to every other vertex in a given graph.
Given a graph and a source vertex, the single source shortest path (SSSP) algorithm
finds a path from the source vertex to every other vertex in the graph,
such that the sum of the weights of the path edges is minimized.
@anchor sssp
@par SSSP
<pre class="syntax">
graph_sssp( vertex_table,
vertex_id,
edge_table,
edge_args,
source_vertex,
out_table,
grouping_cols
)
</pre>
\b Arguments
<dl class="arglist">
<dt>vertex_table</dt>
<dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the
column specified in the 'vertex_id' parameter below.</dd>
<dt>vertex_id</dt>
<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates.
They do not need to be contiguous.</dd>
<dt>edge_table</dt>
<dd>TEXT. Name of the table containing the edge data. The edge table must
contain columns for source vertex, destination vertex and edge weight.
Column naming convention is described below in the 'edge_args' parameter.</dd>
<dt>edge_args</dt>
<dd>TEXT. A comma-delimited string containing multiple named arguments of
the form "name=value". The following parameters are supported for
this string argument:
- src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'.
- dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.
- weight (FLOAT8): Name of the column containing the edge weights in the edge table. Default column name is 'weight'.</dd>
<dt>source_vertex</dt>
<dd>INTEGER or BIGINT. The source vertex id for the algorithm to start. This vertex id must
exist in the 'vertex_id' column of 'vertex_table'.</dd>
<dt>out_table</dt>
<dd>TEXT. Name of the table to store the result of SSSP.
It contains a row for every vertex of every group and have
the following columns (in addition to the grouping columns):
- vertex_id : The id for the destination. Will use the input parameter 'vertex_id' for column naming.
- weight : The total weight of the shortest path from the source vertex to this particular vertex.
Will use the input parameter 'weight' for column naming.
- parent : The parent of this vertex in the shortest path from source. Will use 'parent' for column naming.
A summary table named <out_table>_summary is also created. This is an internal table that keeps a record of the input parameters and is used by the path function described below.
</dd>
<dt>grouping_cols (optional)</dt>
<dd>TEXT, default = NULL. List of columns used to group the input into discrete subgraphs. These columns must exist in the edge table. When this value is null, no grouping is used and a single SSSP result is generated. </dd>
</dl>
@note On a Greenplum cluster, the edge table should be distributed
by the source vertex id column for better performance.
@par Path Retrieval
The path retrieval function returns the shortest path from the
source vertex to a specified desination vertex.
<pre class="syntax">
graph_sssp_get_path( sssp_table,
dest_vertex,
path_table
)
</pre>
\b Arguments
<dl class="arglist">
<dt>sssp_table</dt>
<dd>TEXT. Name of the table that contains the SSSP output.</dd>
<dt>dest_vertex</dt>
<dd>INTEGER or BIGINT. The vertex that will be the destination of the desired path.</dd>
<dt>path_table</dt>
<dd>TEXT. Name of the output table that contains the path.
It contains a row for every group and has the following columns:
- grouping_cols : The grouping columns given in the creation of the SSSP table. If there are no grouping columns, these columns will not exist and the table will have a single row.
- path (ARRAY) : The shortest path from the source vertex (as specified in the SSSP execution) to the destination vertex.
</dd>
</dl>
@anchor notes
@par Notes
The Bellman-Ford algorithm [1] is used to implement SSSP. This algorithm allows
negative edges but not negative cycles. In the case of graphs with
negative cycles, an error will be given and no output table will be generated.
Also see the Grail project [2] for more background on graph analytics processing
in relational databases.
@anchor examples
@examp
-# Create vertex and edge tables to represent the graph:
<pre class="syntax">
DROP TABLE IF EXISTS vertex, edge;
CREATE TABLE vertex(
id INTEGER
);
CREATE TABLE edge(
src INTEGER,
dest INTEGER,
weight FLOAT8
);
INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6),
(7);
INSERT INTO edge VALUES
(0, 1, 1.0),
(0, 2, 1.0),
(0, 4, 10.0),
(1, 2, 2.0),
(1, 3, 10.0),
(2, 3, 1.0),
(2, 5, 1.0),
(2, 6, 3.0),
(3, 0, 1.0),
(4, 0, -2.0),
(5, 6, 1.0),
(6, 7, 1.0);
</pre>
-# Calculate the shortest paths from vertex 0:
<pre class="syntax">
DROP TABLE IF EXISTS out, out_summary;
SELECT madlib.graph_sssp(
'vertex', -- Vertex table
NULL, -- Vertix id column (NULL means use default naming)
'edge', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
0, -- Source vertex for path calculation
'out'); -- Output table of shortest paths
SELECT * FROM out ORDER BY id;
</pre>
<pre class="result">
id | weight | parent
----+--------+--------
0 | 0 | 0
1 | 1 | 0
2 | 1 | 0
3 | 2 | 2
4 | 10 | 0
5 | 2 | 2
6 | 3 | 5
7 | 4 | 6
(8 rows)
</pre>
-# Get the shortest path to vertex 5:
<pre class="syntax">
DROP TABLE IF EXISTS out_path;
SELECT madlib.graph_sssp_get_path('out',5,'out_path');
SELECT * FROM out_path;
</pre>
<pre class="result">
path
\---------
{0,2,5}
</pre>
-# Now let's do a similar example except using
different column names in the tables (i.e., not the defaults).
Create the vertex and edge tables:
<pre class="syntax">
DROP TABLE IF EXISTS vertex_alt, edge_alt;
CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex;
CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight FROM edge;
</pre>
-# Get the shortest path from vertex 1:
<pre class="syntax">
DROP TABLE IF EXISTS out_alt, out_alt_summary;
SELECT madlib.graph_sssp(
'vertex_alt', -- Vertex table
'v_id', -- Vertex id column (NULL means use default naming)
'edge_alt', -- Edge table
'src=e_src, weight=e_weight', -- Edge arguments (NULL means use default naming)
1, -- Source vertex for path calculation
'out_alt'); -- Output table of shortest paths
SELECT * FROM out_alt ORDER BY v_id;
</pre>
<pre class="result">
v_id | e_weight | parent
------+----------+--------
0 | 4 | 3
1 | 0 | 1
2 | 2 | 1
3 | 3 | 2
4 | 14 | 0
5 | 3 | 2
6 | 4 | 5
7 | 5 | 6
(8 rows)
</pre>
-# Create a graph with 2 groups:
<pre class="syntax">
DROP TABLE IF EXISTS edge_gr;
CREATE TABLE edge_gr AS
(
SELECT *, 0 AS grp FROM edge
UNION
SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6
);
INSERT INTO edge_gr VALUES
(4,5,-20,1);
</pre>
-# Find SSSP for all groups
<pre class="syntax">
DROP TABLE IF EXISTS out_gr, out_gr_summary;
SELECT madlib.graph_sssp(
'vertex', -- Vertex table
NULL, -- Vertex id column (NULL means use default naming)
'edge_gr', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
0, -- Source vertex for path calculation
'out_gr', -- Output table of shortest paths
'grp' -- Grouping columns
);
SELECT * FROM out_gr ORDER BY grp,id;
</pre>
<pre class="result">
grp | id | weight | parent
-----+----+--------+--------
0 | 0 | 0 | 0
0 | 1 | 1 | 0
0 | 2 | 1 | 0
0 | 3 | 2 | 2
0 | 4 | 10 | 0
0 | 5 | 2 | 2
0 | 6 | 3 | 5
0 | 7 | 4 | 6
1 | 0 | 0 | 0
1 | 1 | 1 | 0
1 | 2 | 1 | 0
1 | 3 | 2 | 2
1 | 4 | 10 | 0
1 | 5 | -10 | 4
</pre>
-# Find the path to vertex 5 in every group
<pre class="syntax">
DROP TABLE IF EXISTS out_gr_path;
SELECT madlib.graph_sssp_get_path('out_gr',5,'out_gr_path');
SELECT * FROM out_gr_path ORDER BY grp;
</pre>
<pre class="result">
grp | path
-----+---------
0 | {0,2,5}
1 | {0,4,5}
</pre>
@anchor literature
@par Literature
[1] Bellman–Ford algorithm. https://en.wikipedia.org/wiki/Bellman%E2%80%93Ford_algorithm
[2] The case against specialized graph analytics engines, J. Fan, G. Soosai Raj,
and J. M. Patel. CIDR 2015. http://cidrdb.org/cidr2015/Papers/CIDR15_Paper20.pdf
*/
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
source_vertex BIGINT,
out_table TEXT,
grouping_cols TEXT
) RETURNS VOID AS $$
PythonFunction(graph, sssp, graph_sssp)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
source_vertex BIGINT,
out_table TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.graph_sssp($1, $2, $3, $4, $5, $6, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp_get_path(
sssp_table TEXT,
dest_vertex BIGINT,
path_table TEXT
) RETURNS VOID AS $$
PythonFunction(graph, sssp, graph_sssp_get_path)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(graph, sssp, graph_sssp_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.graph_sssp('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------------