| <!-- HTML header for doxygen 1.8.4--> |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> |
| <meta http-equiv="X-UA-Compatible" content="IE=9"/> |
| <meta name="generator" content="Doxygen 1.8.13"/> |
| <meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> |
| <title>MADlib: Weakly Connected Components</title> |
| <link href="tabs.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="jquery.js"></script> |
| <script type="text/javascript" src="dynsections.js"></script> |
| <link href="navtree.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="resize.js"></script> |
| <script type="text/javascript" src="navtreedata.js"></script> |
| <script type="text/javascript" src="navtree.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(initResizable); |
| </script> |
| <link href="search/search.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="search/searchdata.js"></script> |
| <script type="text/javascript" src="search/search.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(function() { init_search(); }); |
| </script> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], |
| jax: ["input/TeX","output/HTML-CSS"], |
| }); |
| </script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script> |
| <!-- hack in the navigation tree --> |
| <script type="text/javascript" src="eigen_navtree_hacks.js"></script> |
| <link href="doxygen.css" rel="stylesheet" type="text/css" /> |
| <link href="madlib_extra.css" rel="stylesheet" type="text/css"/> |
| <!-- google analytics --> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); |
| ga('create', 'UA-45382226-1', 'madlib.apache.org'); |
| ga('send', 'pageview'); |
| </script> |
| </head> |
| <body> |
| <div id="top"><!-- do not remove this div, it is closed by doxygen! --> |
| <div id="titlearea"> |
| <table cellspacing="0" cellpadding="0"> |
| <tbody> |
| <tr style="height: 56px;"> |
| <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> |
| <td style="padding-left: 0.5em;"> |
| <div id="projectname"> |
| <span id="projectnumber">1.16</span> |
| </div> |
| <div id="projectbrief">User Documentation for Apache MADlib</div> |
| </td> |
| <td> <div id="MSearchBox" class="MSearchBoxInactive"> |
| <span class="left"> |
| <img id="MSearchSelect" src="search/mag_sel.png" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| alt=""/> |
| <input type="text" id="MSearchField" value="Search" accesskey="S" |
| onfocus="searchBox.OnSearchFieldFocus(true)" |
| onblur="searchBox.OnSearchFieldFocus(false)" |
| onkeyup="searchBox.OnSearchFieldChange(event)"/> |
| </span><span class="right"> |
| <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> |
| </span> |
| </div> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <!-- end header part --> |
| <!-- Generated by Doxygen 1.8.13 --> |
| <script type="text/javascript"> |
| var searchBox = new SearchBox("searchBox", "search",false,'Search'); |
| </script> |
| </div><!-- top --> |
| <div id="side-nav" class="ui-resizable side-nav-resizable"> |
| <div id="nav-tree"> |
| <div id="nav-tree-contents"> |
| <div id="nav-sync" class="sync"></div> |
| </div> |
| </div> |
| <div id="splitbar" style="-moz-user-select:none;" |
| class="ui-resizable-handle"> |
| </div> |
| </div> |
| <script type="text/javascript"> |
| $(document).ready(function(){initNavTree('group__grp__wcc.html','');}); |
| </script> |
| <div id="doc-content"> |
| <!-- window showing the filter options --> |
| <div id="MSearchSelectWindow" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| onkeydown="return searchBox.OnSearchSelectKey(event)"> |
| </div> |
| |
| <!-- iframe showing the search results (closed by default) --> |
| <div id="MSearchResultsWindow"> |
| <iframe src="javascript:void(0)" frameborder="0" |
| name="MSearchResults" id="MSearchResults"> |
| </iframe> |
| </div> |
| |
| <div class="header"> |
| <div class="headertitle"> |
| <div class="title">Weakly Connected Components<div class="ingroups"><a class="el" href="group__grp__graph.html">Graph</a></div></div> </div> |
| </div><!--header--> |
| <div class="contents"> |
| <div class="toc"><b>Contents</b> <ul> |
| <li> |
| <a href="#wcc">Weakly Connected Components</a> </li> |
| <li> |
| <a href="#rlcc">Retrieve Largest Connected Component</a> </li> |
| <li> |
| <a href="#hist">Build Histogram</a> </li> |
| <li> |
| <a href="#samecpt">Check Vertices in Same Connected Component</a> </li> |
| <li> |
| <a href="#reach">Retrieve Reachable Vertices</a> </li> |
| <li> |
| <a href="#count">Count Connected Components</a> </li> |
| <li> |
| <a href="#examples">Examples</a> </li> |
| </ul> |
| </div><p>Given a directed graph, a weakly connected component (WCC) is a subgraph of the original graph where all vertices are connected to each other by some path, ignoring the direction of edges. In case of an undirected graph, a weakly connected component is also a strongly connected component. This module also includes a number of helper functions that operate on the WCC output.</p> |
| <p><a class="anchor" id="wcc"></a></p><dl class="section user"><dt>Weakly Connected Components</dt><dd><pre class="syntax"> |
| weakly_connected_components( vertex_table, |
| vertex_id, |
| edge_table, |
| edge_args, |
| out_table, |
| grouping_cols |
| ) |
| </pre></dd></dl> |
| <p><b>Arguments</b> </p><dl class="arglist"> |
| <dt>vertex_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table containing the vertex data for the graph. Must contain the column specified in the 'vertex_id' parameter below.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>vertex_id </dt> |
| <dd><p class="startdd">TEXT, default = 'id'. Name of the column in 'vertex_table' containing vertex ids. The vertex ids are of type INTEGER with no duplicates. They do not need to be contiguous.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>edge_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table containing the edge data. The edge table must contain columns for source vertex and destination vertex.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>edge_args </dt> |
| <dd><p class="startdd">TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument:</p><ul> |
| <li>src (INTEGER): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'.</li> |
| <li>dest (INTEGER): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.</li> |
| </ul> |
| <p class="enddd"></p> |
| </dd> |
| <dt>out_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table to store the component ID associated with each vertex. It will contain a row for every vertex from 'vertex_table' with the following columns:</p><ul> |
| <li>vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming.</li> |
| <li>component_id : Component that the vertex belongs to. We use the convention where 'component_id' is the id of the first vertex in a particular group. It means that component ids are generally not contiguous.</li> |
| <li>grouping_cols : Grouping column (if any) values associated with the vertex_id.</li> |
| </ul> |
| <p>A summary table named <out_table>_summary is also created. This is an internal table that keeps a record of some of the input parameters and is used by the weakly connected component helper functions. </p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>grouping_cols (optional) </dt> |
| <dd>TEXT, default: NULL. A single column or a list of comma-separated columns that divides the input data into discrete groups, which are treated independently as separate graphs. When this value is NULL, no grouping is used and weakly connected components are generated for all data (single graph). <dl class="section note"><dt>Note</dt><dd>Expressions are not currently supported for 'grouping_cols'.</dd></dl> |
| </dd> |
| </dl> |
| <dl class="section note"><dt>Note</dt><dd>On a Greenplum cluster, the edge table should be distributed by the source vertex id column for better performance. In addition, the user should note that this function creates a duplicate of the edge table (on Greenplum cluster) for better performance.</dd></dl> |
| <p><a class="anchor" id="rlcc"></a></p><dl class="section user"><dt>Retrieve Largest Connected Component</dt><dd></dd></dl> |
| <p>The largest connected component retrieval function finds the largest weakly connected component(s) in a graph. If weakly connected components was run with grouping, the largest connected components are computed for each group.</p> |
| <pre class="syntax"> |
| graph_wcc_largest_cpt( wcc_table, |
| largest_cpt_table |
| ) |
| </pre><p><b>Arguments</b> </p><dl class="arglist"> |
| <dt>wcc_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table that contains the output of weakly connected components.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>largest_cpt_table </dt> |
| <dd>TEXT. Name of the output table that contains the largest component's information. It contains one or more rows for every group and has the following columns:<ul> |
| <li>grouping_cols: The grouping columns given in the creation of wcc_table. If there are no grouping columns, this column is not created.</li> |
| <li>component_id: The ID of the largest component. Recall that we use the convention where 'component_id' is the id of the first vertex in a particular group. It means that component ids are generally not contiguous. If there are multiple components of the same size, a row is created for each component. If grouping_cols is specified, the largest component is computed for each group.</li> |
| <li>num_vertices: Number of vertices in the largest component. </li> |
| </ul> |
| </dd> |
| </dl> |
| <p><a class="anchor" id="hist"></a></p><dl class="section user"><dt>Retrieve Histogram of Vertices Per Connected Component</dt><dd></dd></dl> |
| <p>This function creates a histogram of the number of vertices per connected component.</p> |
| <pre class="syntax"> |
| graph_wcc_histogram( wcc_table, |
| histogram_table |
| ) |
| </pre><p><b>Arguments</b> </p><dl class="arglist"> |
| <dt>wcc_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table that contains the output of weakly connected components.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>histogram_table </dt> |
| <dd><p class="startdd">TEXT. Name of the output table that contains the number of vertices per component. A row is created for every comoponent in every group if grouping_cols was specified when running weakly connected components. The output table has the following columns:</p><ul> |
| <li>grouping_cols: The grouping columns given during the creation of the wcc_table. If there are no grouping columns, this column is not created.</li> |
| <li>component_id: The ID of the component.</li> |
| <li>num_vertices: Number of vertices in the component specified by the component_id column.</li> |
| </ul> |
| <p class="enddd"></p> |
| </dd> |
| </dl> |
| <p><a class="anchor" id="samecpt"></a></p><dl class="section user"><dt>Check if Two Vertices Belong to the Same Component</dt><dd></dd></dl> |
| <p>This function determines if two vertices belong to the same component.</p> |
| <pre class="syntax"> |
| graph_wcc_vertex_check( wcc_table, |
| vertex_pair, |
| pair_table |
| ) |
| </pre><p><b>Arguments</b> </p><dl class="arglist"> |
| <dt>wcc_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table that contains the output of weakly connected components.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>vertex_pair </dt> |
| <dd><p class="startdd">TEXT. A pair of vertex IDs separated by a comma.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>pair_table </dt> |
| <dd><p class="startdd">TEXT. Name of the output table that specifies if the two vertices in vertex_pair belong to the same component. If wcc_table was generated using grouping_cols, all the components in all groups are considered. The output table has the following columns:</p><ul> |
| <li>component_id: Component ID that contains both the vertices in vertex_pair.</li> |
| <li>grouping_cols: The grouping columns given in the creation of wcc_table. If there are no grouping columns, this column is not created.</li> |
| </ul> |
| <p class="enddd"></p> |
| </dd> |
| </dl> |
| <p><a class="anchor" id="reach"></a></p><dl class="section user"><dt>Retrieve All Vertices Reachable from a Vertex</dt><dd></dd></dl> |
| <p>This function finds all the vertices that can be reached from a given vertex via weakly connected paths.</p> |
| <pre class="syntax"> |
| graph_wcc_reachable_vertices( wcc_table, |
| src, |
| reachable_vertices_table |
| ) |
| </pre><p><b>Arguments</b> </p><dl class="arglist"> |
| <dt>wcc_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table that contains the output of weakly connected components.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>src </dt> |
| <dd><p class="startdd">TEXT. The vertex ID from which all reachable vertices have to be found.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>reachable_vertices_table </dt> |
| <dd><p class="startdd">TEXT. Name of the output table that contains the list of vertices that are reachable from the src vertex. The output table has the following columns:</p><ul> |
| <li>grouping_cols : The grouping columns given in the creation of wcc_table. If there are no grouping columns, this column is not created.</li> |
| <li>component_id : The ID of the component that both the src and dest vertices belong to.</li> |
| <li>dest : Vertex ID that is reachable from the src vertex. Reachability is computed with regard to a component.</li> |
| </ul> |
| <p class="enddd"></p> |
| </dd> |
| </dl> |
| <p><a class="anchor" id="count"></a></p><dl class="section user"><dt>Count of Connected Components</dt><dd></dd></dl> |
| <p>This function finds the total number of components in the input graph.</p> |
| <pre class="syntax"> |
| graph_wcc_num_cpts( wcc_table, |
| count_table |
| ) |
| </pre><p><b>Arguments</b> </p><dl class="arglist"> |
| <dt>wcc_table </dt> |
| <dd><p class="startdd">TEXT. Name of the table that contains the output of weakly connected components.</p> |
| <p class="enddd"></p> |
| </dd> |
| <dt>count_table </dt> |
| <dd><p class="startdd">TEXT. Name of the output table that contains the total number of components per group in the graph, if there are any grouping_cols in wcc_table. The output table has the following columns:</p><ul> |
| <li>grouping_cols : The grouping columns given in the creation of wcc_table. If there are no grouping columns, this column is not created, and count is with regard to the entire graph.</li> |
| <li>num_components : Count of weakly connected components in a graph, or the number of components within a group if grouping_cols is defined. </li> |
| </ul> |
| <p class="enddd"></p> |
| </dd> |
| </dl> |
| <p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl> |
| <ol type="1"> |
| <li>Create vertex and edge tables to represent the graph: <pre class="syntax"> |
| DROP TABLE IF EXISTS vertex, edge; |
| CREATE TABLE vertex( |
| id INTEGER |
| ); |
| CREATE TABLE edge( |
| src INTEGER, |
| dest INTEGER, |
| user_id INTEGER |
| ); |
| INSERT INTO vertex VALUES |
| (0), |
| (1), |
| (2), |
| (3), |
| (4), |
| (5), |
| (6), |
| (10), |
| (11), |
| (12), |
| (13), |
| (14), |
| (15), |
| (16); |
| INSERT INTO edge VALUES |
| (0, 1, 1), |
| (0, 2, 1), |
| (1, 2, 1), |
| (1, 3, 1), |
| (2, 3, 1), |
| (2, 5, 1), |
| (2, 6, 1), |
| (3, 0, 1), |
| (5, 6, 1), |
| (6, 3, 1), |
| (10, 11, 2), |
| (10, 12, 2), |
| (11, 12, 2), |
| (11, 13, 2), |
| (12, 13, 2), |
| (13, 10, 2), |
| (15, 16, 2), |
| (15, 14, 2); |
| </pre></li> |
| <li>Find all the weakly connected components in the graph: <pre class="syntax"> |
| DROP TABLE IF EXISTS wcc_out, wcc_out_summary; |
| SELECT madlib.weakly_connected_components( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'wcc_out'); -- Output table of weakly connected components |
| SELECT * FROM wcc_out ORDER BY component_id, id; |
| </pre> <pre class="result"> |
| id | component_id |
| ----+-------------- |
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 5 | 0 |
| 6 | 0 |
| 4 | 4 |
| 10 | 10 |
| 11 | 10 |
| 12 | 10 |
| 13 | 10 |
| 14 | 14 |
| 15 | 14 |
| 16 | 14 |
| (14 rows) |
| </pre></li> |
| <li>Now get the weakly connected components associated with each 'user_id' using the grouping feature: <pre class="syntax"> |
| DROP TABLE IF EXISTS wcc_out, wcc_out_summary; |
| SELECT madlib.weakly_connected_components( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'wcc_out', -- Output table of weakly connected components |
| 'user_id'); -- Grouping column name |
| SELECT * FROM wcc_out ORDER BY user_id, component_id, id; |
| </pre> <pre class="result"> |
| id | component_id | user_id |
| ----+--------------+--------- |
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 0 | 1 |
| 3 | 0 | 1 |
| 5 | 0 | 1 |
| 6 | 0 | 1 |
| 10 | 10 | 2 |
| 11 | 10 | 2 |
| 12 | 10 | 2 |
| 13 | 10 | 2 |
| 14 | 14 | 2 |
| 15 | 14 | 2 |
| 16 | 14 | 2 |
| (13 rows) |
| </pre> Note that vertex 4 is not identified as a separate component above. This is because there is no entry in the edge table for vertex 4 indicating which group it belongs to (though you could do that if you wanted to).</li> |
| <li>Retrieve the largest connected component: <pre class="syntax"> |
| DROP TABLE IF EXISTS largest_cpt_table; |
| SELECT madlib.graph_wcc_largest_cpt( |
| 'wcc_out', -- WCC output table |
| 'largest_cpt_table'); -- output table containing largest component ID |
| SELECT * FROM largest_cpt_table ORDER BY component_id; |
| </pre> <pre class="result"> |
| user_id | component_id | num_vertices |
| ---------+--------------+-------------- |
| 1 | 0 | 6 |
| 2 | 10 | 4 |
| (2 rows) |
| </pre></li> |
| <li>Retrieve histogram of the number of vertices per connected component: <pre class="syntax"> |
| DROP TABLE IF EXISTS histogram_table; |
| SELECT madlib.graph_wcc_histogram( |
| 'wcc_out', -- WCC output table |
| 'histogram_table'); -- output table containing the histogram of vertices |
| SELECT * FROM histogram_table ORDER BY component_id; |
| </pre> <pre class="result"> |
| user_id | component_id | num_vertices |
| ---------+--------------+-------------- |
| 1 | 0 | 6 |
| 2 | 10 | 4 |
| 2 | 14 | 3 |
| (3 rows) |
| </pre></li> |
| <li>Check if two vertices belong to the same component: <pre class="syntax"> |
| DROP TABLE IF EXISTS vc_table; |
| SELECT madlib.graph_wcc_vertex_check( |
| 'wcc_out', -- WCC output table |
| '14,15', -- Pair of vertex IDs |
| 'vc_table'); -- output table containing components that contain the two vertices |
| SELECT * FROM vc_table ORDER BY component_id; |
| </pre> <pre class="result"> |
| user_id | component_id |
| ---------+-------------- |
| 2 | 14 |
| (1 row) |
| </pre></li> |
| <li>Retrieve all vertices reachable from a vertex <pre class="syntax"> |
| DROP TABLE IF EXISTS reach_table; |
| SELECT madlib.graph_wcc_reachable_vertices( |
| 'wcc_out', -- WCC output table |
| '0', -- source vertex |
| 'reach_table'); -- output table containing all vertices reachable from source vertex |
| SELECT * FROM reach_table ORDER BY component_id, dest; |
| </pre> <pre class="result"> |
| user_id | component_id | dest |
| ---------+--------------+------ |
| 1 | 0 | 1 |
| 1 | 0 | 2 |
| 1 | 0 | 3 |
| 1 | 0 | 5 |
| 1 | 0 | 6 |
| (5 rows) |
| </pre></li> |
| <li>Count of connected components: <pre class="syntax"> |
| DROP TABLE IF EXISTS count_table; |
| SELECT madlib.graph_wcc_num_cpts( |
| 'wcc_out', -- WCC output table |
| 'count_table'); -- output table containing number of components per group |
| SELECT * FROM count_table; |
| </pre> <pre class="result"> |
| user_id | num_components |
| ------—+-------------— |
| 1 | 1 |
| 2 | 2 |
| (2 rows) |
| </pre> </li> |
| </ol> |
| </div><!-- contents --> |
| </div><!-- doc-content --> |
| <!-- start footer part --> |
| <div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> |
| <ul> |
| <li class="footer">Generated on Tue Jul 2 2019 22:35:51 for MADlib by |
| <a href="http://www.doxygen.org/index.html"> |
| <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li> |
| </ul> |
| </div> |
| </body> |
| </html> |