| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file graph.sql_in |
| * |
| * @brief SQL functions for graph analytics |
| * @date June 2017 |
| * |
| * @sa Provides various graph algorithms. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_wcc |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#wcc">Weakly Connected Components</a></li> |
| <li><a href="#rlcc">Retrieve Largest Connected Component</a></li> |
| <li><a href="#hist">Build Histogram</a></li> |
| <li><a href="#samecpt">Check Vertices in Same Connected Component</a></li> |
| <li><a href="#reach">Retrieve Reachable Vertices</a></li> |
| <li><a href="#count">Count Connected Components</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#notes">Notes</a></li> |
| </ul> |
| </div> |
| |
| @brief Find all weakly connected components of a graph. |
| |
| Given a directed graph, a weakly connected component (WCC) is a subgraph of |
| the original graph where all vertices are connected to each other by some path, |
| ignoring the direction of edges. In case of an undirected graph, a weakly |
| connected component is also a strongly connected component. This module also |
| includes a number of helper functions that operate on the WCC output. |
| |
| @anchor wcc |
| @par Weakly Connected Components |
| <pre class="syntax"> |
| weakly_connected_components( vertex_table, |
| vertex_id, |
| edge_table, |
| edge_args, |
| out_table, |
| grouping_cols |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>vertex_table</dt> |
| <dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the |
| column specified in the 'vertex_id' parameter below.</dd> |
| |
| <dt>vertex_id</dt> |
| <dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing |
| vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates. |
| They do not need to be contiguous.</dd> |
| |
| <dt>edge_table</dt> |
| <dd>TEXT. Name of the table containing the edge data. The edge table must |
| contain columns for source vertex and destination vertex.</dd> |
| |
| <dt>edge_args</dt> |
| <dd>TEXT. A comma-delimited string containing multiple named arguments of |
| the form "name=value". The following parameters are supported for |
| this string argument: |
| - src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'. |
| - dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.</dd> |
| |
| <dt>out_table</dt> |
| <dd>TEXT. Name of the table to store the component ID associated with each vertex. |
| It will contain a row for every vertex from 'vertex_table' with |
| the following columns: |
| - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming. |
| - component_id : Component that the vertex belongs to. |
| We use the convention where 'component_id' is the id of |
| the first vertex in a particular group. It means that component ids |
| are generally not contiguous. |
| - grouping_cols : Grouping column (if any) values associated with the vertex_id. |
| |
| A summary table named <out_table>_summary is also created. This is an internal |
| table that keeps a record of some of the input parameters and is used by the |
| weakly connected component helper functions. |
| </dd> |
| |
| <dt>grouping_cols (optional)</dt> |
| <dd>TEXT, default: NULL. A single column or a list of comma-separated |
| columns that divides the input data into discrete groups, which are |
| treated independently as separate graphs. |
| When this value is NULL, no grouping is used and |
| weakly connected components are generated for all data |
| (single graph). |
| @note Expressions are not currently supported for 'grouping_cols'.</dd> |
| |
| </dl> |
| |
| @note On a Greenplum cluster, the edge table should be distributed |
| by the source vertex id column for better performance. |
| In addition, the user should note that this |
| function creates a duplicate of the edge table (on Greenplum cluster) for |
| better performance. |
| |
| @anchor rlcc |
| @par Retrieve Largest Connected Component |
| |
| The largest connected component retrieval function finds the largest weakly |
| connected component(s) in a graph. If weakly connected components was run with |
| grouping, the largest connected components are computed for each group. |
| |
| <pre class="syntax"> |
| graph_wcc_largest_cpt( wcc_table, |
| largest_cpt_table |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>wcc_table</dt> |
| <dd>TEXT. Name of the table that contains the output of weakly connected |
| components.</dd> |
| |
| <dt>largest_cpt_table</dt> |
| <dd>TEXT. Name of the output table that contains the largest component's |
| information. It contains one or more rows for every group and has the following |
| columns: |
| - grouping_cols: The grouping columns given in the creation of wcc_table. |
| If there are no grouping columns, this column is not created. |
| - component_id: The ID of the largest component. Recall that we use the |
| convention where 'component_id' is the id of the first vertex in a |
| particular group. It means that component ids are generally not contiguous. |
| If there are multiple components of the same size, a row is created for each |
| component. If grouping_cols is specified, the largest |
| component is computed for each group. |
| - num_vertices: Number of vertices in the largest component. |
| </dd> |
| </dl> |
| |
| @anchor hist |
| @par Retrieve Histogram of Vertices Per Connected Component |
| |
| This function creates a histogram of the number of vertices |
| per connected component. |
| |
| <pre class="syntax"> |
| graph_wcc_histogram( wcc_table, |
| histogram_table |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>wcc_table</dt> |
| <dd>TEXT. Name of the table that contains the output of weakly connected |
| components.</dd> |
| |
| <dt>histogram_table</dt> |
| <dd>TEXT. Name of the output table that contains the number of vertices per |
| component. A row is created for every comoponent in every group |
| if grouping_cols was specified when running weakly connected components. |
| The output table has the following columns: |
| - grouping_cols: The grouping columns given during the creation of the |
| wcc_table. If there are no grouping columns, this column |
| is not created. |
| - component_id: The ID of the component. |
| - num_vertices: Number of vertices in the component specified by the |
| component_id column. |
| |
| </dd> |
| </dl> |
| |
| @anchor samecpt |
| @par Check if Two Vertices Belong to the Same Component |
| |
| This function determines if two vertices belong to the same component. |
| |
| <pre class="syntax"> |
| graph_wcc_vertex_check( wcc_table, |
| vertex_pair, |
| pair_table |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>wcc_table</dt> |
| <dd>TEXT. Name of the table that contains the output of weakly connected |
| components.</dd> |
| |
| <dt>vertex_pair</dt> |
| <dd>TEXT. A pair of vertex IDs separated by a comma.</dd> |
| |
| <dt>pair_table</dt> |
| <dd>TEXT. Name of the output table that specifies if the two vertices in |
| vertex_pair belong to the same component. If wcc_table was generated using |
| grouping_cols, all the components in all groups are considered. The output |
| table has the following columns: |
| - component_id: Component ID that contains both the vertices in vertex_pair. |
| - grouping_cols: The grouping columns given in the creation of wcc_table. If |
| there are no grouping columns, this column is not created. |
| |
| </dd> |
| </dl> |
| |
| @anchor reach |
| @par Retrieve All Vertices Reachable from a Vertex |
| |
| This function finds all the vertices that can be reached from a given vertex |
| via weakly connected paths. |
| |
| <pre class="syntax"> |
| graph_wcc_reachable_vertices( wcc_table, |
| src, |
| reachable_vertices_table |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>wcc_table</dt> |
| <dd>TEXT. Name of the table that contains the output of weakly connected |
| components.</dd> |
| |
| <dt>src</dt> |
| <dd>TEXT. The vertex ID from which all reachable vertices have to be found.</dd> |
| |
| <dt>reachable_vertices_table</dt> |
| <dd>TEXT. Name of the output table that contains the list of vertices that are |
| reachable from the src vertex. The output table has the following columns: |
| - grouping_cols : The grouping columns given in the creation of wcc_table. If |
| there are no grouping columns, this column is not created. |
| - component_id : The ID of the component that both the src and dest vertices |
| belong to. |
| - dest : Vertex ID that is reachable from the src vertex. |
| Reachability is computed with regard to a component. |
| |
| </dd> |
| </dl> |
| |
| @anchor count |
| @par Count of Connected Components |
| |
| This function finds the total number of components in the input graph. |
| |
| <pre class="syntax"> |
| graph_wcc_num_cpts( wcc_table, |
| count_table |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>wcc_table</dt> |
| <dd>TEXT. Name of the table that contains the output of weakly connected |
| components.</dd> |
| |
| <dt>count_table</dt> |
| <dd>TEXT. Name of the output table that contains the total number of components |
| per group in the graph, if there are any grouping_cols in wcc_table. The output |
| table has the following columns: |
| - grouping_cols : The grouping columns given in the creation of wcc_table. |
| If there are no grouping columns, this column is not created, |
| and count is with regard to the entire graph. |
| - num_components : Count of weakly connected components in a graph, or the |
| number of components within a group if grouping_cols is defined. |
| </dd> |
| |
| </dl> |
| |
| @anchor examples |
| @examp |
| |
| -# Create vertex and edge tables to represent the graph: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS vertex, edge; |
| CREATE TABLE vertex( |
| id INTEGER |
| ); |
| CREATE TABLE edge( |
| src INTEGER, |
| dest INTEGER, |
| user_id INTEGER |
| ); |
| INSERT INTO vertex VALUES |
| (0), |
| (1), |
| (2), |
| (3), |
| (4), |
| (5), |
| (6), |
| (10), |
| (11), |
| (12), |
| (13), |
| (14), |
| (15), |
| (16); |
| INSERT INTO edge VALUES |
| (0, 1, 1), |
| (0, 2, 1), |
| (1, 2, 1), |
| (1, 3, 1), |
| (2, 3, 1), |
| (2, 5, 1), |
| (2, 6, 1), |
| (3, 0, 1), |
| (5, 6, 1), |
| (6, 3, 1), |
| (10, 11, 2), |
| (10, 12, 2), |
| (11, 12, 2), |
| (11, 13, 2), |
| (12, 13, 2), |
| (13, 10, 2), |
| (15, 16, 2), |
| (15, 14, 2); |
| </pre> |
| |
| -# Find all the weakly connected components in the graph: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS wcc_out, wcc_out_summary; |
| SELECT madlib.weakly_connected_components( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'wcc_out'); -- Output table of weakly connected components |
| SELECT * FROM wcc_out ORDER BY component_id, id; |
| </pre> |
| <pre class="result"> |
| id | component_id |
| ----+-------------- |
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 5 | 0 |
| 6 | 0 |
| 4 | 4 |
| 10 | 10 |
| 11 | 10 |
| 12 | 10 |
| 13 | 10 |
| 14 | 14 |
| 15 | 14 |
| 16 | 14 |
| (14 rows) |
| </pre> |
| |
| -# Now get the weakly connected components associated with each 'user_id' |
| using the grouping feature: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS wcc_out, wcc_out_summary; |
| SELECT madlib.weakly_connected_components( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'wcc_out', -- Output table of weakly connected components |
| 'user_id'); -- Grouping column name |
| SELECT * FROM wcc_out ORDER BY user_id, component_id, id; |
| </pre> |
| <pre class="result"> |
| id | component_id | user_id |
| ----+--------------+--------- |
| 0 | 0 | 1 |
| 1 | 0 | 1 |
| 2 | 0 | 1 |
| 3 | 0 | 1 |
| 5 | 0 | 1 |
| 6 | 0 | 1 |
| 10 | 10 | 2 |
| 11 | 10 | 2 |
| 12 | 10 | 2 |
| 13 | 10 | 2 |
| 14 | 14 | 2 |
| 15 | 14 | 2 |
| 16 | 14 | 2 |
| (13 rows) |
| </pre> |
| Note that vertex 4 is not identified as a separate component |
| above. This is because there is no entry in the |
| edge table for vertex 4 indicating which group it belongs to |
| (though you could do that if you wanted to). |
| |
| -# Retrieve the largest connected component: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS largest_cpt_table; |
| SELECT madlib.graph_wcc_largest_cpt( |
| 'wcc_out', -- WCC output table |
| 'largest_cpt_table'); -- output table containing largest component ID |
| SELECT * FROM largest_cpt_table ORDER BY component_id; |
| </pre> |
| <pre class="result"> |
| user_id | component_id | num_vertices |
| ---------+--------------+-------------- |
| 1 | 0 | 6 |
| 2 | 10 | 4 |
| (2 rows) |
| </pre> |
| |
| -# Retrieve histogram of the number of vertices per |
| connected component: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS histogram_table; |
| SELECT madlib.graph_wcc_histogram( |
| 'wcc_out', -- WCC output table |
| 'histogram_table'); -- output table containing the histogram of vertices |
| SELECT * FROM histogram_table ORDER BY component_id; |
| </pre> |
| <pre class="result"> |
| user_id | component_id | num_vertices |
| ---------+--------------+-------------- |
| 1 | 0 | 6 |
| 2 | 10 | 4 |
| 2 | 14 | 3 |
| (3 rows) |
| </pre> |
| |
| -# Check if two vertices belong to the same component: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS vc_table; |
| SELECT madlib.graph_wcc_vertex_check( |
| 'wcc_out', -- WCC output table |
| '14,15', -- Pair of vertex IDs |
| 'vc_table'); -- output table containing components that contain the two vertices |
| SELECT * FROM vc_table ORDER BY component_id; |
| </pre> |
| <pre class="result"> |
| user_id | component_id |
| ---------+-------------- |
| 2 | 14 |
| (1 row) |
| </pre> |
| |
| -# Retrieve all vertices reachable from a vertex |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS reach_table; |
| SELECT madlib.graph_wcc_reachable_vertices( |
| 'wcc_out', -- WCC output table |
| '0', -- source vertex |
| 'reach_table'); -- output table containing all vertices reachable from source vertex |
| SELECT * FROM reach_table ORDER BY component_id, dest; |
| </pre> |
| <pre class="result"> |
| user_id | component_id | dest |
| ---------+--------------+------ |
| 1 | 0 | 1 |
| 1 | 0 | 2 |
| 1 | 0 | 3 |
| 1 | 0 | 5 |
| 1 | 0 | 6 |
| (5 rows) |
| </pre> |
| |
| -# Count of connected components: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS count_table; |
| SELECT madlib.graph_wcc_num_cpts( |
| 'wcc_out', -- WCC output table |
| 'count_table'); -- output table containing number of components per group |
| SELECT * FROM count_table; |
| </pre> |
| <pre class="result"> |
| user_id | num_components |
| ---------+---------------- |
| 1 | 1 |
| 2 | 2 |
| (2 rows) |
| </pre> |
| |
| @anchor notes |
| @par Notes |
| |
| 1. On a Greenplum cluster, the edge table should be distributed |
| by the source vertex id column for better performance. |
| |
| */ |
| |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| grouping_cols TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, wcc, wcc) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.weakly_connected_components($1, $2, $3, $4, $5, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| -- HELPER functions |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_wcc_largest_cpt( |
| wcc_table TEXT, |
| largest_cpt_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, wcc, graph_wcc_largest_cpt) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_wcc_histogram( |
| wcc_table TEXT, |
| histogram_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, wcc, graph_wcc_histogram) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_wcc_vertex_check( |
| wcc_table TEXT, |
| vertex_pair TEXT, |
| pair_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, wcc, graph_wcc_vertex_check) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_wcc_reachable_vertices( |
| wcc_table TEXT, |
| src TEXT, |
| reachable_vertices_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, wcc, graph_wcc_reachable_vertices) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_wcc_num_cpts( |
| wcc_table TEXT, |
| count_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, wcc, graph_wcc_num_cpts) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| ------------------------------------------------------------------------- |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(graph, wcc, wcc_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| ------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.weakly_connected_components() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.weakly_connected_components(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------------- |
| |