blob: 01063b254965c0f7377fd1203f3d889f8be0394b [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Breadth-First Search</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.21.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__bfs.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Breadth-First Search<div class="ingroups"><a class="el" href="group__grp__graph.html">Graph</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li>
<a href="#bfs">Breadth-First Search</a> </li>
<li>
<a href="#examples">Examples</a> </li>
<li>
<a href="#notes">Notes</a> </li>
<li>
<a href="#literature">Literature</a> </li>
</ul>
</div><p>Given a graph and a source vertex, the breadth-first search (BFS) algorithm finds all nodes reachable from the source vertex by searching / traversing the graph in a breadth-first manner.</p>
<p><a class="anchor" id="bfs"></a></p><dl class="section user"><dt>BFS</dt><dd><pre class="syntax">
graph_bfs( vertex_table,
vertex_id,
edge_table,
edge_args,
source_vertex,
out_table,
max_distance,
directed,
grouping_cols
)
</pre></dd></dl>
<p><b>Arguments</b> </p><dl class="arglist">
<dt>vertex_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing the vertex data for the graph. Must contain the column specified in the 'vertex_id' parameter below.</p>
<p class="enddd"></p>
</dd>
<dt>vertex_id </dt>
<dd><p class="startdd">TEXT, default = 'id'. Name of the column in 'vertex_table' containing vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates. They do not need to be contiguous.</p>
<p class="enddd"></p>
</dd>
<dt>edge_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing the edge data. The edge table must contain columns for source vertex and destination vertex. Column naming convention is described below in the 'edge_args' parameter. In addition to vertex columns, if grouping is used then the columns specified in the 'grouping_cols' parameter must be present. </p>
<p class="enddd"></p>
</dd>
<dt>edge_args </dt>
<dd><p class="startdd">TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument:</p><ul>
<li>src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'. (This is not to be confused with the 'source_vertex' argument passed to the BFS function.)</li>
<li>dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.</li>
</ul>
<p class="enddd"></p>
</dd>
<dt>source_vertex </dt>
<dd><p class="startdd">INTEGER or BIGINT. The source vertex id for the algorithm to start. This vertex id must exist in the 'vertex_id' column of 'vertex_table'.</p>
<p class="enddd"></p>
</dd>
<dt>out_table </dt>
<dd><p class="startdd">TEXT. Name of the table to store the result of BFS. It contains a row for every vertex that is reachable from the source_vertex. In the presence of grouping columns, only those edges are used for which there are no NULL values in any grouping column. The output table will have the following columns (in addition to the grouping columns):</p><ul>
<li>vertex_id : The id for any node reachable from source_vertex in addition to the source_vertex. Will use the input parameter 'vertex_id' for column naming.</li>
<li>dist : The distance in number of edges (or hops) from the source_vertex to where this vertex is located.</li>
<li>parent : The parent of this vertex in BFS traversal of the graph from source_vertex. Will use 'parent' for column naming. For the case where vertex_id = source_vertex, the value for parent is NULL.</li>
</ul>
<p>A summary table named &lt;out_table&gt;_summary is also created. This is an internal table that keeps a record of the input parameters. </p>
<p class="enddd"></p>
</dd>
<dt>max_distance (optional) </dt>
<dd><p class="startdd">INT, default = NULL. Maximum distance to traverse from the source vertex. When this value is null, traverses until reaches leaf node. E.g., if set to 1 will return only adjacent vertices, if set to 7 will return vertices up to a maximum distance of 7 vertices away.</p>
<p class="enddd"></p>
</dd>
<dt>directed (optional) </dt>
<dd><p class="startdd">BOOLEAN, default = FALSE. If TRUE the graph will be treated as directed, else it will be treated as an undirected graph.</p>
<p class="enddd"></p>
</dd>
<dt>grouping_cols (optional) </dt>
<dd>TEXT, default = NULL. A comma-separated list of columns used to group the input into discrete subgraphs. These columns must exist in the edge table. When this value is NULL, no grouping is used and a single BFS result is generated. <dl class="section note"><dt>Note</dt><dd>Expressions are not currently supported for 'grouping_cols'.</dd></dl>
</dd>
</dl>
<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
<ol type="1">
<li>Create vertex and edge tables to represent the graph: <pre class="syntax">
DROP TABLE IF EXISTS vertex, edge;
CREATE TABLE vertex(
id INTEGER
);
CREATE TABLE edge(
src INTEGER,
dest INTEGER
);
INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6),
(7),
(8),
(9),
(10),
(11)
;
INSERT INTO edge VALUES
(0, 5),
(1, 0),
(1, 3),
(2, 6),
(3, 4),
(3, 5),
(4, 2),
(8, 9),
(9, 10),
(9, 11),
(10, 8);
</pre></li>
<li>Traverse undirected graph from vertex 3: <pre class="syntax">
DROP TABLE IF EXISTS out, out_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertix id column (NULL means use default naming)
'edge', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
3, -- Source vertex for BFS
'out'); -- Output table of nodes reachable from source_vertex
-- Default values used for the other arguments
SELECT * FROM out ORDER BY dist,id;
</pre> <pre class="result">
id | dist | parent
----+------+--------
3 | 0 | 3
1 | 1 | 3
4 | 1 | 3
5 | 1 | 3
0 | 2 | 1
2 | 2 | 4
6 | 3 | 2
(7 rows)
</pre> <pre class="syntax">
SELECT * FROM out_summary;
</pre> <pre class="result">
vertex_table | vertex_id | edge_table | edge_args | source_vertex | out_table | max_distance | directed | grouping_cols
--------------+-----------+------------+-----------+---------------+-----------+--------------+----------+---------------
vertex | NULL | edge | NULL | 3 | out | | | NULL
(1 row)
</pre></li>
<li>In this example, we use max_distance to limit the search distance. <pre class="syntax">
DROP TABLE IF EXISTS out_max, out_max_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertix id column (NULL means use default naming)
'edge', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
3, -- Source vertex for BFS
'out_max', -- Output table of nodes reachable from source_vertex
2); -- Maximum distance to traverse from source_vertex
-- Default values used for the other arguments
SELECT * FROM out_max ORDER BY dist,id;
</pre> <pre class="result">
id | dist | parent
----+------+--------
3 | 0 | 3
1 | 1 | 3
4 | 1 | 3
5 | 1 | 3
0 | 2 | 1
2 | 2 | 4
(6 rows)
</pre></li>
<li>Now let's do an example using different column names in the tables (i.e., not the defaults). Create the vertex and edge tables: <pre class="syntax">
DROP TABLE IF EXISTS vertex_alt, edge_alt;
CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex;
CREATE TABLE edge_alt AS SELECT src AS n1, dest AS n2 FROM edge;
</pre></li>
<li>Run BFS from vertex 8: <pre class="syntax">
DROP TABLE IF EXISTS out_alt, out_alt_summary;
SELECT madlib.graph_bfs(
'vertex_alt', -- Vertex table
'v_id', -- Vertex id column (NULL means use default naming)
'edge_alt', -- Edge table
'src=n1, dest=n2', -- Edge arguments (NULL means use default naming)
8, -- Source vertex for BFS
'out_alt'); -- Output table of nodes reachable from source_vertex
SELECT * FROM out_alt ORDER BY v_id;
</pre> <pre class="result">
v_id | dist | parent
------+------+--------
8 | 0 | 8
9 | 1 | 8
10 | 1 | 8
11 | 2 | 9
</pre></li>
<li>Now we show an example where the graph is treated as a directed graph. <pre class="syntax">
DROP TABLE IF EXISTS out_alt_dir, out_alt_dir_summary;
SELECT madlib.graph_bfs(
'vertex_alt', -- Vertex table
'v_id', -- Vertex id column (NULL means use default naming)
'edge_alt', -- Edge table
'src=n1, dest=n2', -- Edge arguments (NULL means use default naming)
8, -- Source vertex for BFS
'out_alt_dir', -- Output table of nodes reachable from source_vertex
NULL, -- Maximum distance to traverse from source_vertex
TRUE); -- Flag for specifying directed graph
SELECT * FROM out_alt_dir ORDER BY v_id;
</pre> <pre class="result">
v_id | dist | parent
------+------+--------
8 | 0 | 8
9 | 1 | 8
10 | 2 | 9
11 | 2 | 9
(4 rows)
</pre> Notice that, with the graph being treated as directed, the parent of v_id=10 is now vertex 9 and not 8 as in the undirected case.</li>
<li>Create a graph with 2 groups: <pre class="syntax">
DROP TABLE IF EXISTS edge_gr;
CREATE TABLE edge_gr(
g1 INTEGER,
g2 TEXT,
src INTEGER,
dest INTEGER
);
INSERT INTO edge_gr VALUES
(100, 'a', 0, 5),
(100, 'a', 1, 0),
(100, 'a', 1, 3),
(100, 'a', 2, 6),
(100, 'a', 3, 4),
(100, 'a', 3, 5),
(100, 'a', 4, 2),
(100, 'a', 8, 9),
(100, 'a', 9, 10),
(100, 'a', 9, 11),
(100, 'a', 10, 8),
(202, 'c', 8, 9),
(202, 'c', 9, 10),
(202, 'c', 9, 11),
(202, 'c', 10, 8)
;
</pre></li>
<li>Run BFS for all groups from a given source_vertex. <pre class="syntax">
DROP TABLE IF EXISTS out_gr, out_gr_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertex id column (NULL means use default naming)
'edge_gr', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
8, -- Source vertex for BFS
'out_gr', -- Output table of nodes reachable from source_vertex
NULL, -- Maximum distance to traverse from source_vertex
NULL, -- Flag for specifying directed graph
'g1,g2' -- Grouping columns
);
SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
</pre> <pre class="result">
g1 | g2 | id | dist | parent
-----+----+----+------+--------
100 | a | 8 | 0 | 8
100 | a | 9 | 1 | 8
100 | a | 10 | 1 | 8
100 | a | 11 | 2 | 9
202 | c | 8 | 0 | 8
202 | c | 9 | 1 | 8
202 | c | 10 | 1 | 8
202 | c | 11 | 2 | 9
(8 rows)
</pre> If source_vertex is not present in a group, then that group will not appear in the output table. <pre class="syntax">
DROP TABLE IF EXISTS out_gr, out_gr_summary;
SELECT madlib.graph_bfs(
'vertex', -- Vertex table
NULL, -- Vertex id column (NULL means use default naming)
'edge_gr', -- Edge table
NULL, -- Edge arguments (NULL means use default naming)
3, -- Source vertex for BFS
'out_gr', -- Output table of nodes reachable from source_vertex
NULL, -- Maximum distance to traverse from source_vertex
NULL, -- Flag for specifying directed graph
'g1,g2' -- Grouping columns
);
SELECT * FROM out_gr ORDER BY g1,g2,dist,id;
</pre> <pre class="result">
g1 | g2 | id | dist | parent
-----+----+----+------+--------
100 | a | 3 | 0 | 3
100 | a | 1 | 1 | 3
100 | a | 4 | 1 | 3
100 | a | 5 | 1 | 3
100 | a | 0 | 2 | 1
100 | a | 2 | 2 | 4
100 | a | 6 | 3 | 2
(7 rows)
</pre></li>
</ol>
<p><a class="anchor" id="notes"></a></p><dl class="section user"><dt>Notes</dt><dd></dd></dl>
<ol type="1">
<li>On a Greenplum cluster, the edge table should be distributed by the source vertex id column for better performance.</li>
<li>The graph_bfs function is a SQL implementation of the well-known breadth-first search algorithm [1] modified appropriately for a relational database. It will find any node in the graph reachable from the 'source_vertex' only once. If a node is reachable by many different paths from the 'source_vertex' (i.e. has more than one parent), then only one of those parents is present in the output table. The BFS result will, in general, be different for different choices of 'source_vertex'.</li>
</ol>
<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>[1] Breadth-first Search algorithm. <a href="https://en.wikipedia.org/wiki/Breadth-first_search">https://en.wikipedia.org/wiki/Breadth-first_search</a> </p>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Feb 23 2023 19:26:40 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>