blob: f9507d94b7ea682cca43bd08ac9a4d44dc47236d [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file bfs.sql_in
*
* @brief SQL functions for graph analytics
* @date Jun 2017
*
* @sa Provides a breadth first search graph algorithm.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_bfs
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#bfs">Breadth-First Search</a></li>
<li><a href="#notes">Notes</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Finds the nodes reachable from a given source vertex using a breadth-first approach.
Given a graph and a source vertex, the breadth-first search (BFS) algorithm
finds all nodes reachable from the source vertex by searching / traversing the graph
in a breadth-first manner.
@anchor bfs
@par BFS
<pre class="syntax">
graph_bfs( vertex_table,
vertex_id,
edge_table,
edge_args,
source_vertex,
out_table,
max_distance,
directed,
grouping_cols
)
</pre>
\b Arguments
<dl class="arglist">
<dt>vertex_table</dt>
<dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the
column specified in the 'vertex_id' parameter below.</dd>
<dt>vertex_id</dt>
<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates.
They do not need to be contiguous.</dd>
<dt>edge_table</dt>
<dd>TEXT. Name of the table containing the edge data. The edge table must contain
columns for source vertex and destination vertex. Column naming convention is
described below in the 'edge_args' parameter.
In addition to vertex columns, if grouping is used then the columns specified
in the 'grouping_cols' parameter must be present. </dd>
<dt>edge_args</dt>
<dd>TEXT. A comma-delimited string containing multiple named arguments of
the form "name=value". The following parameters are supported for
this string argument:
- src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table.
Default column name is 'src'.
(This is not to be confused with the 'source_vertex' argument passed to the BFS function.)
- dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in
the edge table. Default column name is 'dest'.
<dt>source_vertex</dt>
<dd>INTEGER or BIGINT. The source vertex id for the algorithm to start. This vertex id must
exist in the 'vertex_id' column of 'vertex_table'.</dd>
<dt>out_table</dt>
<dd>TEXT. Name of the table to store the result of BFS.
It contains a row for every vertex that is reachable from the source_vertex.
In the presence of grouping columns, only those edges are used for which there are no NULL values
in any grouping column.
The output table will have the following columns (in addition to the grouping columns):
- vertex_id : The id for any node reachable from source_vertex in addition to
the source_vertex. Will use the input parameter 'vertex_id' for
column naming.
- dist : The distance in number of edges (or hops) from the source_vertex
to where this vertex is located.
- parent : The parent of this vertex in BFS traversal of the graph from source_vertex.
Will use 'parent' for column naming. For the
case where vertex_id = source_vertex, the value for parent is NULL.
A summary table named <out_table>_summary is also created. This is an internal table that keeps a record of the input parameters.
</dd>
<dt>max_distance (optional)</dt>
<dd>INT, default = NULL. Maximum distance to traverse
from the source vertex. When this value is null,
traverses until reaches leaf node. E.g., if set
to 1 will return only adjacent vertices, if set
to 7 will return vertices up to a maximum distance
of 7 vertices away.
<dt>directed (optional)</dt>
<dd>BOOLEAN, default = FALSE. If TRUE the graph will be treated as directed, else it will be treated as an undirected graph.</dd>
<dt>grouping_cols (optional)</dt>
<dd>TEXT, default = NULL. A comma-separated list of columns used to group the
input into discrete subgraphs.
These columns must exist in the edge table. When this value is NULL, no grouping is used
and a single BFS result is generated.
@note Expressions are not currently supported for 'grouping_cols'.</dd>
</dl>
@note On a Greenplum cluster, the edge table should be distributed
by the source vertex id column for better performance.
@anchor notes
@par Notes
The graph_bfs function is a SQL implementation of the well-known breadth-first
search algorithm [1] modified appropriately for a relational database. It will
find any node in the graph reachable from the source_vertex only once. If a node
is reachable by many different paths from the source_vertex (i.e. has more than
one parent), then only one of those parents is present in the output table.
The BFS result will, in general, be different for different choices of source_vertex.
@anchor examples
@examp
-# Create vertex and edge tables to represent the graph:
<pre class="syntax">
DROP TABLE IF EXISTS vertex, edge;
CREATE TABLE vertex(
id INTEGER
);
CREATE TABLE edge(
src INTEGER,
dest INTEGER
);
INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6),
(7),
(8),
(9),
(10),
(11)
;
INSERT INTO edge VALUES
(0, 5),
(1, 0),
(1, 3),
(2, 6),
(3, 4),
(3, 5),
(4, 2),
(8, 9),
(9, 10),
(9, 11),
(10, 8);
</pre>
-# Traverse undirected graph from vertex 3:
<pre class="syntax">
DROP TABLE IF EXISTS out, out_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertix id column (NULL means use default naming)
'edge', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
3, -- Source vertex for BFS
'out'); -- Output table of nodes reachable from source_vertex
-- Default values used for the other arguments
SELECT * FROM out ORDER BY dist,id;
</pre>
<pre class="result">
id | dist | parent
----+------+--------
3 | 0 | 3
1 | 1 | 3
4 | 1 | 3
5 | 1 | 3
0 | 2 | 1
2 | 2 | 4
6 | 3 | 2
(7 rows)
</pre>
<pre class="syntax">
SELECT * FROM out_summary;
</pre>
<pre class="result">
vertex_table | vertex_id | edge_table | edge_args | source_vertex | out_table | max_distance | directed | grouping_cols
--------------+-----------+------------+-----------+---------------+-----------+--------------+----------+---------------
vertex | NULL | edge | NULL | 3 | out | | | NULL
(1 row)
</pre>
-# In this example, we use max_distance to limit the search distance.
<pre class="syntax">
DROP TABLE IF EXISTS out_max, out_max_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertix id column (NULL means use default naming)
'edge', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
3, -- Source vertex for BFS
'out_max', -- Output table of nodes reachable from source_vertex
2); -- Maximum distance to traverse from source_vertex
-- Default values used for the other arguments
SELECT * FROM out_max ORDER BY dist,id;
</pre>
<pre class="result">
id | dist | parent
----+------+--------
3 | 0 | 3
1 | 1 | 3
4 | 1 | 3
5 | 1 | 3
0 | 2 | 1
2 | 2 | 4
(6 rows)
</pre>
-# Now let's do an example using
different column names in the tables (i.e., not the defaults).
Create the vertex and edge tables:
<pre class="syntax">
DROP TABLE IF EXISTS vertex_alt, edge_alt;
CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex;
CREATE TABLE edge_alt AS SELECT src AS n1, dest AS n2 FROM edge;
</pre>
-# Run BFS from vertex 8:
<pre class="syntax">
DROP TABLE IF EXISTS out_alt, out_alt_summary;
SELECT madlib.graph_bfs(
'vertex_alt', -- Vertex table
'v_id', -- Vertex id column (NULL means use default naming)
'edge_alt', -- Edge table
'src=n1, dest=n2', -- Edge arguments (NULL means use default naming)
8, -- Source vertex for BFS
'out_alt'); -- Output table of nodes reachable from source_vertex
SELECT * FROM out_alt ORDER BY v_id;
</pre>
<pre class="result">
v_id | dist | parent
------+------+--------
8 | 0 | 8
9 | 1 | 8
10 | 1 | 8
11 | 2 | 9
</pre>
-# Now we show an example where the graph is treated as a directed graph.
<pre class="syntax">
DROP TABLE IF EXISTS out_alt_dir, out_alt_dir_summary;
SELECT madlib.graph_bfs(
'vertex_alt', -- Vertex table
'v_id', -- Vertex id column (NULL means use default naming)
'edge_alt', -- Edge table
'src=n1, dest=n2', -- Edge arguments (NULL means use default naming)
8, -- Source vertex for BFS
'out_alt_dir', -- Output table of nodes reachable from source_vertex
NULL, -- Maximum distance to traverse from source_vertex
TRUE); -- Flag for specifying directed graph
SELECT * FROM out_alt_dir ORDER BY v_id;
</pre>
<pre class="result">
v_id | dist | parent
------+------+--------
8 | 0 | 8
9 | 1 | 8
10 | 2 | 9
11 | 2 | 9
(4 rows)
</pre>
Notice that, with the graph being treated as directed, the parent of v_id=10
is now vertex 9 and not 8 as in the undirected case.
-# Create a graph with 2 groups:
<pre class="syntax">
DROP TABLE IF EXISTS edge_gr;
CREATE TABLE edge_gr(
g1 INTEGER,
g2 TEXT,
src INTEGER,
dest INTEGER
);
INSERT INTO edge_gr VALUES
(100, 'a', 0, 5),
(100, 'a', 1, 0),
(100, 'a', 1, 3),
(100, 'a', 2, 6),
(100, 'a', 3, 4),
(100, 'a', 3, 5),
(100, 'a', 4, 2),
(100, 'a', 8, 9),
(100, 'a', 9, 10),
(100, 'a', 9, 11),
(100, 'a', 10, 8),
(202, 'c', 8, 9),
(202, 'c', 9, 10),
(202, 'c', 9, 11),
(202, 'c', 10, 8)
;
</pre>
-# Run BFS for all groups from a given source_vertex.
<pre class="syntax">
DROP TABLE IF EXISTS out_gr, out_gr_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertex id column (NULL means use default naming)
'edge_gr', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
8, -- Source vertex for BFS
'out_gr', -- Output table of nodes reachable from source_vertex
NULL, -- Maximum distance to traverse from source_vertex
NULL, -- Flag for specifying directed graph
'g1,g2' -- Grouping columns
);
SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
</pre>
<pre class="result">
g1 | g2 | id | dist | parent
-----+----+----+------+--------
100 | a | 8 | 0 | 8
100 | a | 9 | 1 | 8
100 | a | 10 | 1 | 8
100 | a | 11 | 2 | 9
202 | c | 8 | 0 | 8
202 | c | 9 | 1 | 8
202 | c | 10 | 1 | 8
202 | c | 11 | 2 | 9
(8 rows)
</pre>
If source_vertex is not present in
a group, then that group will not appear in the output table.
<pre class="syntax">
DROP TABLE IF EXISTS out_gr, out_gr_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertex id column (NULL means use default naming)
'edge_gr', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
3, -- Source vertex for BFS
'out_gr', -- Output table of nodes reachable from source_vertex
NULL, -- Maximum distance to traverse from source_vertex
NULL, -- Flag for specifying directed graph
'g1,g2' -- Grouping columns
);
SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
</pre>
<pre class="result">
g1 | g2 | id | dist | parent
-----+----+----+------+--------
100 | a | 3 | 0 | 3
100 | a | 1 | 1 | 3
100 | a | 4 | 1 | 3
100 | a | 5 | 1 | 3
100 | a | 0 | 2 | 1
100 | a | 2 | 2 | 4
100 | a | 6 | 3 | 2
(7 rows)
</pre>
@anchor literature
@par Literature
[1] Breadth-first Search algorithm. https://en.wikipedia.org/wiki/Breadth-first_search
*/
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
source_vertex BIGINT,
out_table TEXT,
max_distance INT,
directed BOOLEAN,
grouping_cols TEXT
) RETURNS VOID AS $$
PythonFunction(graph, bfs, graph_bfs)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
source_vertex BIGINT,
out_table TEXT,
max_distance INT,
directed BOOLEAN
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.graph_bfs($1, $2, $3, $4, $5, $6, $7, $8, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
source_vertex BIGINT,
out_table TEXT,
max_distance INT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.graph_bfs($1, $2, $3, $4, $5, $6, $7, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
source_vertex BIGINT,
out_table TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.graph_bfs($1, $2, $3, $4, $5, $6, NULL, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(graph, bfs, graph_bfs_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
--------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_bfs()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.graph_bfs('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
--------------------------------------------------------------------------------