| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file apsp.sql_in |
| * |
| * @brief SQL functions for graph analytics |
| * @date Nov 2016 |
| * |
| * @sa Provides all pairs shortest path algorithm. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_apsp |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#apsp">APSP</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#notes">Notes</a></li> |
| <li><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Finds the shortest paths between every vertex pair in a given graph. |
| |
| The all pairs shortest paths (APSP) algorithm finds the length (summed weights) |
| of the shortest paths between all pairs of vertices, such that the sum of the |
| weights of the path edges is minimized. |
| |
| @warning APSP is an expensive algorithm for run-time |
| because it finds the shortest path between all nodes |
| in the graph. It is recommended that you start with a |
| small graph to get a sense of run-time for your use case, |
| then increase size carefully from there. The worst case run-time |
| for this implementation is O(V^2 * E) where V is the |
| number of vertices and E is the number of edges. In |
| practice, run-time will be generally be |
| much less than this, but it depends on the graph. |
| On a Greenplum cluster, the edge table should be distributed |
| by the source vertex id column for better performance. |
| |
| @anchor apsp |
| @par APSP |
| <pre class="syntax"> |
| graph_apsp( vertex_table, |
| vertex_id, |
| edge_table, |
| edge_args, |
| out_table, |
| grouping_cols |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>vertex_table</dt> |
| <dd>TEXT. Name of the table containing the vertex data for the graph. Must |
| contain the column specified in the 'vertex_id' parameter below.</dd> |
| |
| <dt>vertex_id</dt> |
| <dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing |
| vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates. |
| They do not need to be contiguous.</dd> |
| |
| <dt>edge_table</dt> |
| <dd>TEXT. Name of the table containing the edge data. The edge table must |
| contain columns for source vertex, destination vertex and edge weight. |
| Column naming convention is described below in the 'edge_args' parameter.</dd> |
| |
| <dt>edge_args</dt> |
| <dd>TEXT. A comma-delimited string containing multiple named arguments of |
| the form "name=value". The following parameters are supported for |
| this string argument: |
| - src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the |
| edge table. Default column name is 'src'. |
| - dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids |
| in the edge table. Default column name is 'dest'. |
| - weight (FLOAT8): Name of the column containing the edge weights in the |
| edge table. Default column name is 'weight'.</dd> |
| |
| <dt>out_table</dt> |
| <dd>TEXT. Name of the table to store the result of APSP. |
| It contains a row for every vertex of every group and have |
| the following columns (in addition to the grouping columns): |
| - source_vertex: The id for the source vertex. Will use the input edge |
| column 'src' for column naming. |
| - dest_vertex: The id for the destination vertex. Will use the input |
| edge column 'dest' for column naming. |
| - weight: The total weight of the shortest path from the source vertex to |
| the destination vertex. Will use the input parameter 'weight' for column |
| naming. |
| - parent: The parent of the destination vertex in the shortest path from |
| source. Parent will equal dest_vertex if there are no |
| intermediate vertices. Will use 'parent' for column naming. |
| |
| A summary table named <out_table>_summary is also created. This is an internal |
| table that keeps a record of the input parameters and is used by the path |
| retrieval function described below. |
| </dd> |
| |
| <dt>grouping_cols (optional)</dt> |
| <dd>TEXT, default = NULL. List of columns used to group the input into discrete |
| subgraphs. These columns must exist in the edge table. When this value is null, |
| no grouping is used and a single APSP result is generated. </dd> |
| </dl> |
| |
| @par Path Retrieval |
| |
| The path retrieval function returns the shortest path from the |
| source vertex to a specified desination vertex. |
| |
| <pre class="syntax"> |
| graph_apsp_get_path( apsp_table, |
| source_vertex, |
| dest_vertex, |
| path_table |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>apsp_table</dt> |
| <dd>TEXT. Name of the table that contains the APSP output.</dd> |
| |
| <dt>source_vertex</dt> |
| <dd>INTEGER or BIGINT. The vertex that will be the source of the desired path.</dd> |
| |
| <dt>dest_vertex</dt> |
| <dd>INTEGER or BIGINT. The vertex that will be the destination of the desired path.</dd> |
| |
| <dt>path_table</dt> |
| <dd>TEXT. Name of the output table that contains the path. |
| It contains a row for every group and has the following columns: |
| - grouping_cols: The grouping columns given in the creation of the APSP |
| table. If there are no grouping columns, these columns will not exist and the |
| table will have a single row. |
| - path (ARRAY): The shortest path from the source vertex to the destination |
| vertex. |
| </dd> |
| |
| </dl> |
| |
| @anchor examples |
| @examp |
| |
| -# Create vertex and edge tables to represent the graph: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS vertex, edge; |
| CREATE TABLE vertex( |
| id INTEGER |
| ); |
| CREATE TABLE edge( |
| src INTEGER, |
| dest INTEGER, |
| weight FLOAT8 |
| ); |
| INSERT INTO vertex VALUES |
| (0), |
| (1), |
| (2), |
| (3), |
| (4), |
| (5), |
| (6), |
| (7); |
| INSERT INTO edge VALUES |
| (0, 1, 1.0), |
| (0, 2, 1.0), |
| (0, 4, 10.0), |
| (1, 2, 2.0), |
| (1, 3, 10.0), |
| (2, 3, 1.0), |
| (2, 5, 1.0), |
| (2, 6, 3.0), |
| (3, 0, 1.0), |
| (4, 0, -2.0), |
| (5, 6, 1.0), |
| (6, 7, 1.0); |
| </pre> |
| |
| -# Calculate the shortest paths: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS out, out_summary; |
| SELECT madlib.graph_apsp( |
| 'vertex', -- Vertex table |
| NULL, -- Vertix id column (NULL means use default naming) |
| 'edge', -- Edge table |
| NULL, -- Edge arguments (NULL means use default naming) |
| 'out'); -- Output table of shortest paths |
| SELECT * FROM out ORDER BY src,dest; |
| </pre> |
| <pre class="result"> |
| src | dest | weight | parent |
| -----+------+----------+-------- |
| 0 | 0 | 0 | 0 |
| 0 | 1 | 1 | 1 |
| 0 | 2 | 1 | 2 |
| 0 | 3 | 2 | 2 |
| 0 | 4 | 10 | 4 |
| 0 | 5 | 2 | 2 |
| 0 | 6 | 3 | 5 |
| 0 | 7 | 4 | 6 |
| 1 | 0 | 4 | 3 |
| 1 | 1 | 0 | 1 |
| 1 | 2 | 2 | 2 |
| 1 | 3 | 3 | 2 |
| 1 | 4 | 14 | 0 |
| 1 | 5 | 3 | 2 |
| 1 | 6 | 4 | 5 |
| 1 | 7 | 5 | 6 |
| (showing only 16 of 64 rows) |
| </pre> |
| |
| -# Get the shortest path from vertex 0 to vertex 5: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS out_path; |
| SELECT madlib.graph_apsp_get_path('out',0,5,'out_path'); |
| SELECT * FROM out_path; |
| </pre> |
| <pre class="result"> |
| path |
| \--------- |
| {0,2,5} |
| </pre> |
| |
| -# Now let's do a similar example except using |
| different column names in the tables (i.e., not the defaults). |
| Create the vertex and edge tables: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS vertex_alt, edge_alt; |
| CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex; |
| CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight FROM edge; |
| </pre> |
| |
| -# Calculate the shortest paths: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS out_alt, out_alt_summary; |
| SELECT madlib.graph_apsp( |
| 'vertex_alt', -- Vertex table |
| 'v_id', -- Vertex id column |
| 'edge_alt', -- Edge table |
| 'src=e_src, weight=e_weight', -- Edge arguments |
| 'out_alt'); -- Output table of shortest paths |
| SELECT * FROM out_alt ORDER BY e_src, dest; |
| </pre> |
| <pre class="result"> |
| e_src | dest | e_weight | parent |
| -------+------+----------+-------- |
| 0 | 0 | 0 | 0 |
| 0 | 1 | 1 | 1 |
| 0 | 2 | 1 | 2 |
| 0 | 3 | 2 | 2 |
| 0 | 4 | 10 | 4 |
| 0 | 5 | 2 | 2 |
| 0 | 6 | 3 | 5 |
| 0 | 7 | 4 | 6 |
| 1 | 0 | 4 | 3 |
| 1 | 1 | 0 | 1 |
| 1 | 2 | 2 | 2 |
| 1 | 3 | 3 | 2 |
| 1 | 4 | 14 | 0 |
| 1 | 5 | 3 | 2 |
| 1 | 6 | 4 | 5 |
| 1 | 7 | 5 | 6 |
| (showing only 16 of 64 rows) |
| </pre> |
| |
| -# Create a graph with 2 groups and find APSP for each group: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS edge_gr; |
| CREATE TABLE edge_gr AS |
| ( |
| SELECT *, 0 AS grp FROM edge |
| UNION |
| SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6 |
| ); |
| INSERT INTO edge_gr VALUES |
| (4,5,-20,1); |
| </pre> |
| |
| -# Find APSP for all groups: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS out_gr, out_gr_summary; |
| SELECT madlib.graph_apsp( |
| 'vertex', -- Vertex table |
| NULL, -- Vertex id column (NULL means use default naming) |
| 'edge_gr', -- Edge table |
| NULL, -- Edge arguments (NULL means use default naming) |
| 'out_gr', -- Output table of shortest paths |
| 'grp' -- Grouping columns |
| ); |
| SELECT * FROM out_gr WHERE src < 2 ORDER BY grp,src,dest; |
| </pre> |
| <pre class="result"> |
| grp | src | dest | weight | parent |
| -----+-----+------+--------+-------- |
| 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 1 | 1 | 1 |
| 0 | 0 | 2 | 1 | 2 |
| 0 | 0 | 3 | 2 | 2 |
| 0 | 0 | 4 | 10 | 4 |
| 0 | 0 | 5 | 2 | 2 |
| 0 | 0 | 6 | 4 | 2 |
| 0 | 0 | 7 | 5 | 6 |
| 0 | 1 | 0 | 4 | 3 |
| 0 | 1 | 1 | 0 | 1 |
| 0 | 1 | 2 | 2 | 2 |
| 0 | 1 | 3 | 3 | 2 |
| 0 | 1 | 4 | 14 | 0 |
| 0 | 1 | 5 | 3 | 2 |
| 0 | 1 | 6 | 4 | 5 |
| 0 | 1 | 7 | 5 | 6 |
| 1 | 0 | 0 | 0 | 0 |
| 1 | 0 | 1 | 1 | 1 |
| 1 | 0 | 2 | 1 | 2 |
| 1 | 0 | 3 | 2 | 2 |
| 1 | 0 | 4 | 10 | 4 |
| 1 | 0 | 5 | -10 | 4 |
| 1 | 1 | 0 | 4 | 3 |
| 1 | 1 | 1 | 0 | 1 |
| 1 | 1 | 2 | 2 | 2 |
| 1 | 1 | 3 | 3 | 2 |
| 1 | 1 | 4 | 14 | 0 |
| 1 | 1 | 5 | -6 | 4 |
| (28 rows) |
| </pre> |
| |
| -# Find the path from vertex 0 to vertex 5 in every group |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS out_gr_path; |
| SELECT madlib.graph_apsp_get_path('out_gr',0,5,'out_gr_path'); |
| SELECT * FROM out_gr_path ORDER BY grp; |
| </pre> |
| <pre class="result"> |
| grp | path |
| -----+--------- |
| 0 | {0,2,5} |
| 1 | {0,4,5} |
| </pre> |
| |
| @anchor notes |
| @par Notes |
| |
| 1. Graphs with negative edges are supported but graphs with negative cycles are not. |
| |
| 2. The implementation for APSP is analogous to a matrix multiplication operation. |
| Please refer to the MADlib design document and references [1] and [2] |
| for more details. |
| |
| 3. Also see the Grail project [3] for more background on graph analytics processing |
| in relational databases. |
| |
| @anchor literature |
| @par Literature |
| |
| [1] http://www.columbia.edu/~cs2035/courses/ieor6614.S11/apsp.pdf |
| |
| [2] http://users.cecs.anu.edu.au/~Alistair.Rendell/Teaching/apac_comp3600/module4/all_pairs_shortest_paths.xhtml |
| |
| [3] The case against specialized graph analytics engines, J. Fan, G. Soosai Raj, |
| and J. M. Patel. CIDR 2015. http://cidrdb.org/cidr2015/Papers/CIDR15_Paper20.pdf |
| */ |
| |
| ------------------------------------------------------------------------- |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| grouping_cols TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, apsp, graph_apsp) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.graph_apsp($1, $2, $3, $4, $5, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp_get_path( |
| apsp_table TEXT, |
| source_vertex BIGINT, |
| dest_vertex BIGINT, |
| path_table TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, apsp, graph_apsp_get_path) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------------- |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(graph, apsp, graph_apsp_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.graph_apsp(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| -------------------------------------------------------------------------------- |