| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file graph.sql_in |
| * |
| * @brief SQL functions for graph analytics |
| * @date Nov 2016 |
| * |
| * @sa Provides various graph algorithms. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_pagerank |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#pagerank">PageRank</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Find the PageRank of all vertices in a directed graph. |
| |
| Given a graph, the PageRank algorithm outputs a probability distribution representing the |
| likelihood that a person randomly traversing the graph will arrive at any particular vertex. |
| This algorithm was originally used by Google to rank websites where the World Wide Web was |
| modeled as a directed graph with the vertices representing the websites. The PageRank |
| algorithm initially proposed by Larry Page and Sergey Brin is implemented here [1]. |
| |
| We also implement personalized PageRank, in which a notion of importance |
| provides personalization to a query. |
| For example, importance scores can be biased according |
| to a specified set of vertices in the graph that are of interest or special in some way [2]. |
| |
| @anchor pagerank |
| @par PageRank |
| <pre class="syntax"> |
| pagerank( vertex_table, |
| vertex_id, |
| edge_table, |
| edge_args, |
| out_table, |
| damping_factor, |
| max_iter, |
| threshold, |
| grouping_cols, |
| personalization_vertices |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>vertex_table</dt> |
| <dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the |
| column specified in the 'vertex_id' parameter below.</dd> |
| |
| <dt>vertex_id</dt> |
| <dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing |
| vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates. |
| They do not need to be contiguous.</dd> |
| |
| <dt>edge_table</dt> |
| <dd>TEXT. Name of the table containing the edge data. The edge table must |
| contain columns for source vertex and destination vertex.</dd> |
| |
| <dt>edge_args</dt> |
| <dd>TEXT. A comma-delimited string containing multiple named arguments of |
| the form "name=value". The following parameters are supported for |
| this string argument: |
| - src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table. |
| Default column name is 'src'. |
| - dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table. |
| Default column name is 'dest'.</dd> |
| |
| <dt>out_table</dt> |
| <dd>TEXT. Name of the table to store the result of PageRank. |
| It will contain a row for every vertex from 'vertex_table' with |
| the following columns: |
| - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming. |
| - pagerank : The vertex's PageRank. |
| - grouping_cols : Grouping column (if any) values associated with the vertex_id.</dd> |
| |
| A summary table is also created that contains information |
| regarding the number of iterations required for convergence. |
| It is named by adding the suffix '_summary' to the 'out_table' |
| parameter. |
| |
| <dt>damping_factor (optional)</dt> |
| <dd>FLOAT8, default 0.85. The probability, at any step, that a user will continue following the links in a random surfer model.</dd> |
| |
| <dt>max_iter (optional)</dt> |
| <dd>INTEGER, default: 100. The maximum number of iterations allowed.</dd> |
| |
| <dt>threshold (optional)</dt> |
| <dd>FLOAT8, default: (1/number of vertices * 1000). If the difference between the PageRank of every vertex of two consecutive |
| iterations is smaller than 'threshold', or the iteration number is larger than 'max_iter', the |
| computation stops. If you set the threshold to zero, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'. |
| It is advisable to set threshold to a value lower than 1/(number of vertices in the graph) since the PageRank value of nodes is initialized to that |
| value.</dd> |
| |
| <dt>grouping_cols (optional)</dt> |
| <dd>TEXT, default: NULL. A single column or a list of comma-separated |
| columns that divides the input data into discrete groups, resulting in one |
| distribution per group. When this value is NULL, no grouping is used and |
| a single model is generated for all data. |
| @note Expressions are not currently supported for 'grouping_cols'.</dd> |
| |
| <dt> personalization_vertices (optional)</dt> |
| <dd>INTEGER[] or BIGINT[], default: NULL. A comma separated list of vertices or nodes |
| for personalized PageRank. When this parameter is provided, personalized PageRank |
| will run. In the absence of this parameter, regular PageRank will run. |
| </dl> |
| |
| @note On a Greenplum cluster, the edge table should be distributed |
| by the source vertex id column for better performance. |
| |
| @anchor examples |
| @examp |
| |
| -# Create vertex and edge tables to represent the graph: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS vertex, edge; |
| CREATE TABLE vertex( |
| id INTEGER |
| ); |
| CREATE TABLE edge( |
| src INTEGER, |
| dest INTEGER, |
| user_id INTEGER |
| ); |
| INSERT INTO vertex VALUES |
| (0), |
| (1), |
| (2), |
| (3), |
| (4), |
| (5), |
| (6); |
| INSERT INTO edge VALUES |
| (0, 1, 1), |
| (0, 2, 1), |
| (0, 4, 1), |
| (1, 2, 1), |
| (1, 3, 1), |
| (2, 3, 1), |
| (2, 5, 1), |
| (2, 6, 1), |
| (3, 0, 1), |
| (4, 0, 1), |
| (5, 6, 1), |
| (6, 3, 1), |
| (0, 1, 2), |
| (0, 2, 2), |
| (0, 4, 2), |
| (1, 2, 2), |
| (1, 3, 2), |
| (2, 3, 2), |
| (3, 0, 2), |
| (4, 0, 2), |
| (5, 6, 2), |
| (6, 3, 2); |
| </pre> |
| |
| -# Running PageRank with default values for optional parameters: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary; |
| SELECT madlib.pagerank( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'pagerank_out'); -- Output table of PageRank |
| SELECT * FROM pagerank_out ORDER BY pagerank DESC; |
| </pre> |
| <pre class="result"> |
| id | pagerank |
| ----+------------------- |
| 0 | 0.28753749341184 |
| 3 | 0.21016988901855 |
| 2 | 0.14662683454062 |
| 4 | 0.10289614384217 |
| 1 | 0.10289614384217 |
| 6 | 0.09728637768887 |
| 5 | 0.05258711765692 |
| (7 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM pagerank_out_summary; |
| </pre> |
| <pre class="result"> |
| __iterations__ |
| ----------------+ |
| 16 |
| (1 row) |
| </pre> |
| |
| -# Running PageRank with a damping factor of 0.5 results in different final values: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary; |
| SELECT madlib.pagerank( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'pagerank_out', -- Output table of PageRank |
| 0.5); -- Damping factor |
| SELECT * FROM pagerank_out ORDER BY pagerank DESC; |
| </pre> |
| <pre class="result"> |
| id | pagerank |
| ----+-------------------- |
| 0 | 0.225477161441199 |
| 3 | 0.199090328586664 |
| 2 | 0.136261327206477 |
| 6 | 0.132691559968224 |
| 4 | 0.109009291409508 |
| 1 | 0.109009291409508 |
| 5 | 0.0884610399788161 |
| (7 rows) |
| </pre> |
| |
| -# Now compute the PageRank of vertices associated with each user |
| using the grouping feature: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary; |
| SELECT madlib.pagerank( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'pagerank_out', -- Output table of PageRank |
| NULL, -- Default damping factor (0.85) |
| NULL, -- Default max iters (100) |
| 0.00000001, -- Threshold |
| 'user_id'); -- Grouping column name |
| SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC; |
| </pre> |
| <pre class="result"> |
| user_id | id | pagerank |
| ---------+----+-------------------- |
| 1 | 0 | 0.27825488388552 |
| 1 | 3 | 0.20188114667075 |
| 1 | 2 | 0.14288112346059 |
| 1 | 6 | 0.11453637832147 |
| 1 | 1 | 0.10026745615438 |
| 1 | 4 | 0.10026745615438 |
| 1 | 5 | 0.06191155535288 |
| 2 | 0 | 0.31854625004173 |
| 2 | 3 | 0.23786686773343 |
| 2 | 2 | 0.15914876489397 |
| 2 | 1 | 0.11168334437971 |
| 2 | 4 | 0.11168334437971 |
| 2 | 6 | 0.03964285714285 |
| 2 | 5 | 0.02142857142857 |
| (14 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM pagerank_out_summary ORDER BY user_id; |
| </pre> |
| <pre class="result"> |
| user_id | __iterations__ |
| ---------+---------------- |
| 1 | 27 |
| 2 | 31 |
| (2 rows) |
| </pre> |
| |
| -# Personalized PageRank. Here we specify {2,4} |
| as the personalization vertices. This parameter |
| could be specified as ARRAY[2,4] as well. |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary; |
| SELECT madlib.pagerank( |
| 'vertex', -- Vertex table |
| 'id', -- Vertix id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimted string of edge arguments |
| 'pagerank_out', -- Output table of PageRank |
| NULL, -- Default damping factor (0.85) |
| NULL, -- Default max iters (100) |
| NULL, -- Default Threshold |
| NULL, -- No Grouping |
| '{2,4}'); -- Personalization vertices |
| SELECT * FROM pagerank_out ORDER BY pagerank DESC; |
| </pre> |
| <pre class="result"> |
| id | pagerank |
| ----+-------------------- |
| 0 | 0.565232961966315 |
| 2 | 0.378139420991773 |
| 3 | 0.355003292266017 |
| 4 | 0.310111215897626 |
| 1 | 0.160111215897626 |
| 6 | 0.148615315574136 |
| 5 | 0.0803403307142321 |
| (7 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM pagerank_out_summary; |
| </pre> |
| <pre class="result"> |
| __iterations__ |
| ----------------+ |
| 37 |
| (1 row) |
| </pre> |
| |
| @anchor literature |
| @par Literature |
| |
| [1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual Web search engine", |
| Computer Networks and ISDN Systems. 30: 107–117, |
| http://infolab.stanford.edu/pub/papers/google.pdf |
| |
| [2] Jeh, Glen and Widom, Jennifer. "Scaling Personalized Web Search", |
| Proceedings of the 12th international conference on World Wide Web, Pages 271-279 |
| Budapest, Hungary, May 20-24, 2003, |
| http://ilpubs.stanford.edu:8090/530/1/2002-12.pdf |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| damping_factor FLOAT8, |
| max_iter INTEGER, |
| threshold FLOAT8, |
| grouping_cols VARCHAR, |
| personalization_vertices BIGINT[] |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, pagerank, pagerank) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| damping_factor FLOAT8, |
| max_iter INTEGER, |
| threshold FLOAT8, |
| grouping_cols VARCHAR |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| damping_factor FLOAT8, |
| max_iter INTEGER, |
| threshold FLOAT8 |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| damping_factor FLOAT8, |
| max_iter INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| damping_factor FLOAT8 |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, NULL, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, NULL, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(graph, pagerank, pagerank_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.pagerank(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| -------------------------------------------------------------------------------- |