blob: 9adf8447fc98b290bcb7d22d5dbca757b07c7832 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file graph.sql_in
*
* @brief SQL functions for graph analytics
* @date Nov 2016
*
* @sa Provides various graph algorithms.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_pagerank
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#pagerank">PageRank</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#notes">Notes</a></li>
<li><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Find the PageRank of all vertices in a directed graph.
Given a graph, the PageRank algorithm outputs a probability distribution representing the
likelihood that a person randomly traversing the graph will arrive at any particular vertex.
This algorithm was originally used by Google to rank websites where the World Wide Web was
modeled as a directed graph with the vertices representing the websites. The PageRank
algorithm initially proposed by Larry Page and Sergey Brin is implemented here [1].
We also implement personalized PageRank, in which a notion of importance
provides personalization to a query.
For example, importance scores can be biased according
to a specified set of vertices in the graph that are of interest or special in some way [2].
@anchor pagerank
@par PageRank
<pre class="syntax">
pagerank( vertex_table,
vertex_id,
edge_table,
edge_args,
out_table,
damping_factor,
max_iter,
threshold,
grouping_cols,
personalization_vertices
)
</pre>
\b Arguments
<dl class="arglist">
<dt>vertex_table</dt>
<dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the
column specified in the 'vertex_id' parameter below.</dd>
<dt>vertex_id</dt>
<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates.
They do not need to be contiguous.</dd>
<dt>edge_table</dt>
<dd>TEXT. Name of the table containing the edge data. The edge table must
contain columns for source vertex and destination vertex.</dd>
<dt>edge_args</dt>
<dd>TEXT. A comma-delimited string containing multiple named arguments of
the form "name=value". The following parameters are supported for
this string argument:
- src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table.
Default column name is 'src'.
- dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table.
Default column name is 'dest'.</dd>
<dt>out_table</dt>
<dd>TEXT. Name of the table to store the result of PageRank.
It will contain a row for every vertex from 'vertex_table' with
the following columns:
- vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming.
- pagerank : The vertex's PageRank.
- grouping_cols : Grouping column (if any) values associated with the vertex_id.</dd>
A summary table is also created that contains information
regarding the number of iterations required for convergence.
It is named by adding the suffix '_summary' to the 'out_table'
parameter.
<dt>damping_factor (optional)</dt>
<dd>FLOAT8, default 0.85. The probability, at any step, that a user will continue following the links in a random surfer model.</dd>
<dt>max_iter (optional)</dt>
<dd>INTEGER, default: 100. The maximum number of iterations allowed.</dd>
<dt>threshold (optional)</dt>
<dd>FLOAT8, default: (1/number of vertices * 1000). If the difference between the PageRank of every vertex of two consecutive
iterations is smaller than 'threshold', or the iteration number is larger than 'max_iter', the
computation stops. If you set the threshold to zero, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'.
It is advisable to set threshold to a value lower than 1/(number of vertices in the graph) since the PageRank value of nodes is initialized to that
value.</dd>
<dt>grouping_cols (optional)</dt>
<dd>TEXT, default: NULL. A single column or a list of comma-separated
columns that divides the input data into discrete groups, resulting in one
distribution per group. When this value is NULL, no grouping is used and
a single model is generated for all data.
@note Expressions are not currently supported for 'grouping_cols'.</dd>
<dt> personalization_vertices (optional)</dt>
<dd>INTEGER[] or BIGINT[], default: NULL. A comma separated list of vertices or nodes
for personalized PageRank. When this parameter is provided, personalized PageRank
will run. In the absence of this parameter, regular PageRank will run.
</dl>
@anchor examples
@examp
-# Create vertex and edge tables to represent the graph:
<pre class="syntax">
DROP TABLE IF EXISTS vertex, edge;
CREATE TABLE vertex(
id INTEGER
);
CREATE TABLE edge(
src INTEGER,
dest INTEGER,
user_id INTEGER
);
INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6);
INSERT INTO edge VALUES
(0, 1, 1),
(0, 2, 1),
(0, 4, 1),
(1, 2, 1),
(1, 3, 1),
(2, 3, 1),
(2, 5, 1),
(2, 6, 1),
(3, 0, 1),
(4, 0, 1),
(5, 6, 1),
(6, 3, 1),
(0, 1, 2),
(0, 2, 2),
(0, 4, 2),
(1, 2, 2),
(1, 3, 2),
(2, 3, 2),
(3, 0, 2),
(4, 0, 2),
(5, 6, 2),
(6, 3, 2);
</pre>
-# Running PageRank with default values for optional parameters:
<pre class="syntax">
DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
SELECT madlib.pagerank(
'vertex', -- Vertex table
'id', -- Vertix id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimted string of edge arguments
'pagerank_out'); -- Output table of PageRank
SELECT * FROM pagerank_out ORDER BY pagerank DESC;
</pre>
<pre class="result">
id | pagerank
----+-------------------
0 | 0.28753749341184
3 | 0.21016988901855
2 | 0.14662683454062
4 | 0.10289614384217
1 | 0.10289614384217
6 | 0.09728637768887
5 | 0.05258711765692
(7 rows)
</pre>
<pre class="syntax">
SELECT * FROM pagerank_out_summary;
</pre>
<pre class="result">
__iterations__
----------------+
16
(1 row)
</pre>
-# Running PageRank with a damping factor of 0.5 results in different final values:
<pre class="syntax">
DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
SELECT madlib.pagerank(
'vertex', -- Vertex table
'id', -- Vertix id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimted string of edge arguments
'pagerank_out', -- Output table of PageRank
0.5); -- Damping factor
SELECT * FROM pagerank_out ORDER BY pagerank DESC;
</pre>
<pre class="result">
id | pagerank
----+--------------------
0 | 0.225477161441199
3 | 0.199090328586664
2 | 0.136261327206477
6 | 0.132691559968224
4 | 0.109009291409508
1 | 0.109009291409508
5 | 0.0884610399788161
(7 rows)
</pre>
-# Now compute the PageRank of vertices associated with each user
using the grouping feature:
<pre class="syntax">
DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
SELECT madlib.pagerank(
'vertex', -- Vertex table
'id', -- Vertix id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimted string of edge arguments
'pagerank_out', -- Output table of PageRank
NULL, -- Default damping factor (0.85)
NULL, -- Default max iters (100)
0.00000001, -- Threshold
'user_id'); -- Grouping column name
SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;
</pre>
<pre class="result">
user_id | id | pagerank
---------+----+--------------------
1 | 0 | 0.27825488388552
1 | 3 | 0.20188114667075
1 | 2 | 0.14288112346059
1 | 6 | 0.11453637832147
1 | 1 | 0.10026745615438
1 | 4 | 0.10026745615438
1 | 5 | 0.06191155535288
2 | 0 | 0.31854625004173
2 | 3 | 0.23786686773343
2 | 2 | 0.15914876489397
2 | 1 | 0.11168334437971
2 | 4 | 0.11168334437971
2 | 6 | 0.03964285714285
2 | 5 | 0.02142857142857
(14 rows)
</pre>
<pre class="syntax">
SELECT * FROM pagerank_out_summary ORDER BY user_id;
</pre>
<pre class="result">
user_id | __iterations__
---------+----------------
1 | 27
2 | 31
(2 rows)
</pre>
-# Personalized PageRank. Here we specify {2,4}
as the personalization vertices. This parameter
could be specified as ARRAY[2,4] as well.
<pre class="syntax">
DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
SELECT madlib.pagerank(
'vertex', -- Vertex table
'id', -- Vertix id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimted string of edge arguments
'pagerank_out', -- Output table of PageRank
NULL, -- Default damping factor (0.85)
NULL, -- Default max iters (100)
NULL, -- Default Threshold
NULL, -- No Grouping
'{2,4}'); -- Personalization vertices
SELECT * FROM pagerank_out ORDER BY pagerank DESC;
</pre>
<pre class="result">
id | pagerank
----+--------------------
0 | 0.565232961966315
2 | 0.378139420991773
3 | 0.355003292266017
4 | 0.310111215897626
1 | 0.160111215897626
6 | 0.148615315574136
5 | 0.0803403307142321
(7 rows)
</pre>
<pre class="syntax">
SELECT * FROM pagerank_out_summary;
</pre>
<pre class="result">
__iterations__
----------------+
37
(1 row)
</pre>
@anchor notes
@par Notes
1. On a Greenplum cluster, the edge table should be distributed
by the source vertex id column for better performance.
@anchor literature
@par Literature
[1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual Web search engine",
Computer Networks and ISDN Systems. 30: 107–117,
http://infolab.stanford.edu/pub/papers/google.pdf
[2] Jeh, Glen and Widom, Jennifer. "Scaling Personalized Web Search",
Proceedings of the 12th international conference on World Wide Web, Pages 271-279
Budapest, Hungary, May 20-24, 2003,
http://ilpubs.stanford.edu:8090/530/1/2002-12.pdf
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
damping_factor FLOAT8,
max_iter INTEGER,
threshold FLOAT8,
grouping_cols VARCHAR,
personalization_vertices BIGINT[]
) RETURNS VOID AS $$
PythonFunction(graph, pagerank, pagerank)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
damping_factor FLOAT8,
max_iter INTEGER,
threshold FLOAT8,
grouping_cols VARCHAR
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
damping_factor FLOAT8,
max_iter INTEGER,
threshold FLOAT8
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
damping_factor FLOAT8,
max_iter INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
damping_factor FLOAT8
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, NULL, NULL, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, NULL, NULL, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(graph, pagerank, pagerank_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
--------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.pagerank('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
--------------------------------------------------------------------------------