src/ports/postgres/modules/graph/pagerank.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  *
  * @file graph.sql_in
  *
  * @brief SQL functions for graph analytics
  * @date Nov 2016
  *
  * @sa Provides various graph algorithms.
  *
  *//* ----------------------------------------------------------------------- */
 m4_include(`SQLCommon.m4')


 /**
 @addtogroup grp_pagerank

 <div class="toc"><b>Contents</b>
 <ul>
 <li><a href="#pagerank">PageRank</a></li>
 <li><a href="#examples">Examples</a></li>
 <li><a href="#notes">Notes</a></li>
 <li><a href="#literature">Literature</a></li>
 </ul>
 </div>

 @brief Find the PageRank of all vertices in a directed graph.

 Given a graph, the PageRank algorithm outputs a probability distribution representing the
 likelihood that a person randomly traversing the graph will arrive at any particular vertex.
 This algorithm was originally used by Google to rank websites where the World Wide Web was
 modeled as a directed graph with the vertices representing the websites.  The PageRank
 algorithm initially proposed by Larry Page and Sergey Brin is implemented here [1].

 We also implement personalized PageRank, in which a notion of importance
 provides personalization to a query.
 For example, importance scores can be biased according
 to a specified set of vertices in the graph that are of interest or special in some way [2].

 @anchor pagerank
 @par PageRank
 <pre class="syntax">
 pagerank( vertex_table,
           vertex_id,
           edge_table,
           edge_args,
           out_table,
           damping_factor,
           max_iter,
           threshold,
           grouping_cols,
           personalization_vertices
         )
 </pre>

 \b Arguments
 <dl class="arglist">
 <dt>vertex_table</dt>
 <dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the
 column specified in the 'vertex_id' parameter below.</dd>

 <dt>vertex_id</dt>
 <dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
 vertex ids.  The vertex ids can be of type INTEGER or BIGINT with no duplicates.
 They do not need to be contiguous.</dd>

 <dt>edge_table</dt>
 <dd>TEXT. Name of the table containing the edge data. The edge table must
 contain columns for source vertex and destination vertex.</dd>

 <dt>edge_args</dt>
 <dd>TEXT. A comma-delimited string containing multiple named arguments of
 the form "name=value". The following parameters are supported for
 this string argument:
   - src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table.
                    Default column name is 'src'.
   - dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table.
                     Default column name is 'dest'.</dd>

 <dt>out_table</dt>
 <dd>TEXT. Name of the table to store the result of PageRank.
 It will contain a row for every vertex from 'vertex_table' with
 the following columns:
   - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming.
   - pagerank : The vertex's PageRank.
   - grouping_cols : Grouping column (if any) values associated with the vertex_id.</dd>

 A summary table is also created that contains information
 regarding the number of iterations required for convergence.
 It is named by adding the suffix '_summary' to the 'out_table'
 parameter.

 <dt>damping_factor (optional)</dt>
 <dd>FLOAT8, default 0.85. The probability, at any step, that a user will continue following the links in a random surfer model.</dd>

 <dt>max_iter (optional)</dt>
 <dd>INTEGER, default: 100. The maximum number of iterations allowed.</dd>

 <dt>threshold (optional)</dt>
 <dd>FLOAT8, default: (1/number of vertices * 1000). If the difference between the PageRank of every vertex of two consecutive
 iterations is smaller than 'threshold', or the iteration number is larger than 'max_iter', the
 computation stops.  If you set the threshold to zero, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'.
 It is advisable to set threshold to a value lower than 1/(number of vertices in the graph) since the PageRank value of nodes is initialized to that
 value.</dd>

 <dt>grouping_cols (optional)</dt>
 <dd>TEXT, default: NULL. A single column or a list of comma-separated
 columns that divides the input data into discrete groups, resulting in one
 distribution per group. When this value is NULL, no grouping is used and
 a single model is generated for all data.
 @note Expressions are not currently supported for 'grouping_cols'.</dd>

 <dt> personalization_vertices (optional)</dt>
 <dd>INTEGER[] or BIGINT[], default: NULL. A comma separated list of vertices or nodes
 for personalized PageRank. When this parameter is provided, personalized PageRank
 will run.  In the absence of this parameter, regular PageRank will run.
 </dl>

 @anchor examples
 @examp

 -# Create vertex and edge tables to represent the graph:
 <pre class="syntax">
 DROP TABLE IF EXISTS vertex, edge;
 CREATE TABLE vertex(
         id INTEGER
         );
 CREATE TABLE edge(
         src INTEGER,
         dest INTEGER,
         user_id INTEGER
         );
 INSERT INTO vertex VALUES
 (0),
 (1),
 (2),
 (3),
 (4),
 (5),
 (6);
 INSERT INTO edge VALUES
 (0, 1, 1),
 (0, 2, 1),
 (0, 4, 1),
 (1, 2, 1),
 (1, 3, 1),
 (2, 3, 1),
 (2, 5, 1),
 (2, 6, 1),
 (3, 0, 1),
 (4, 0, 1),
 (5, 6, 1),
 (6, 3, 1),
 (0, 1, 2),
 (0, 2, 2),
 (0, 4, 2),
 (1, 2, 2),
 (1, 3, 2),
 (2, 3, 2),
 (3, 0, 2),
 (4, 0, 2),
 (5, 6, 2),
 (6, 3, 2);
 </pre>

 -# Running PageRank with default values for optional parameters:
 <pre class="syntax">
 DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
                        'vertex',             -- Vertex table
                        'id',                 -- Vertix id column
                        'edge',               -- Edge table
                        'src=src, dest=dest', -- Comma delimted string of edge arguments
                        'pagerank_out');      -- Output table of PageRank
 SELECT * FROM pagerank_out ORDER BY pagerank DESC;
 </pre>
 <pre class="result">
  id |      pagerank
 ----+-------------------
   0 |  0.28753749341184
   3 |  0.21016988901855
   2 |  0.14662683454062
   4 |  0.10289614384217
   1 |  0.10289614384217
   6 |  0.09728637768887
   5 |  0.05258711765692
 (7 rows)
 </pre>
 <pre class="syntax">
 SELECT * FROM pagerank_out_summary;
 </pre>
 <pre class="result">
  __iterations__
  ----------------+
              16
 (1 row)
 </pre>

 -# Running PageRank with a damping factor of 0.5 results in different final values:
 <pre class="syntax">
 DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
                          'vertex',             -- Vertex table
                          'id',                 -- Vertix id column
                          'edge',               -- Edge table
                          'src=src, dest=dest', -- Comma delimted string of edge arguments
                          'pagerank_out',       -- Output table of PageRank
                          0.5);                 -- Damping factor
 SELECT * FROM pagerank_out ORDER BY pagerank DESC;
 </pre>
 <pre class="result">
  id |      pagerank
 ----+--------------------
   0 |  0.225477161441199
   3 |  0.199090328586664
   2 |  0.136261327206477
   6 |  0.132691559968224
   4 |  0.109009291409508
   1 |  0.109009291409508
   5 | 0.0884610399788161
 (7 rows)
 </pre>

 -# Now compute the PageRank of vertices associated with each user
 using the grouping feature:
 <pre class="syntax">
 DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
                          'vertex',             -- Vertex table
                          'id',                 -- Vertix id column
                          'edge',               -- Edge table
                          'src=src, dest=dest', -- Comma delimted string of edge arguments
                          'pagerank_out',       -- Output table of PageRank
                          NULL,                 -- Default damping factor (0.85)
                          NULL,                 -- Default max iters (100)
                          0.00000001,           -- Threshold
                          'user_id');           -- Grouping column name
 SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;
 </pre>
 <pre class="result">
  user_id | id |      pagerank
 ---------+----+--------------------
        1 |  0 |  0.27825488388552
        1 |  3 |  0.20188114667075
        1 |  2 |  0.14288112346059
        1 |  6 |  0.11453637832147
        1 |  1 |  0.10026745615438
        1 |  4 |  0.10026745615438
        1 |  5 |  0.06191155535288
        2 |  0 |  0.31854625004173
        2 |  3 |  0.23786686773343
        2 |  2 |  0.15914876489397
        2 |  1 |  0.11168334437971
        2 |  4 |  0.11168334437971
        2 |  6 |  0.03964285714285
        2 |  5 |  0.02142857142857
 (14 rows)
 </pre>
 <pre class="syntax">
 SELECT * FROM pagerank_out_summary ORDER BY user_id;
 </pre>
 <pre class="result">
  user_id | __iterations__
 ---------+----------------
        1 |             27
        2 |             31
 (2 rows)
 </pre>

 -# Personalized PageRank. Here we specify {2,4}
 as the personalization vertices. This parameter
 could be specified as ARRAY[2,4] as well.
 <pre class="syntax">
 DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
 SELECT madlib.pagerank(
                        'vertex',             -- Vertex table
                        'id',                 -- Vertix id column
                        'edge',               -- Edge table
                        'src=src, dest=dest', -- Comma delimted string of edge arguments
                        'pagerank_out',       -- Output table of PageRank
                         NULL,                -- Default damping factor (0.85)
                         NULL,                -- Default max iters (100)
                         NULL,                -- Default Threshold
                         NULL,                -- No Grouping
                        '{2,4}');             -- Personalization vertices
 SELECT * FROM pagerank_out ORDER BY pagerank DESC;
 </pre>
 <pre class="result">
  id |      pagerank
 ----+--------------------
   0 |  0.565232961966315
   2 |  0.378139420991773
   3 |  0.355003292266017
   4 |  0.310111215897626
   1 |  0.160111215897626
   6 |  0.148615315574136
   5 | 0.0803403307142321
 (7 rows)
 </pre>
 <pre class="syntax">
 SELECT * FROM pagerank_out_summary;
 </pre>
 <pre class="result">
  __iterations__
  ----------------+
              37
 (1 row)
 </pre>

 @anchor notes
 @par Notes

 1. On a Greenplum cluster, the edge table should be distributed
 by the source vertex id column for better performance.

 @anchor literature
 @par Literature

 [1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual Web search engine",
 Computer Networks and ISDN Systems. 30: 107–117,
 http://infolab.stanford.edu/pub/papers/google.pdf

 [2] Jeh, Glen and Widom, Jennifer. "Scaling Personalized Web Search",
 Proceedings of the 12th international conference on World Wide Web, Pages 271-279
 Budapest, Hungary, May 20-24, 2003,
 http://ilpubs.stanford.edu:8090/530/1/2002-12.pdf
 */

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     vertex_table    TEXT,
     vertex_id       TEXT,
     edge_table      TEXT,
     edge_args       TEXT,
     out_table       TEXT,
     damping_factor  FLOAT8,
     max_iter        INTEGER,
     threshold       FLOAT8,
     grouping_cols   VARCHAR,
     personalization_vertices BIGINT[]
 ) RETURNS VOID AS $$
     PythonFunction(graph, pagerank, pagerank)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     vertex_table    TEXT,
     vertex_id       TEXT,
     edge_table      TEXT,
     edge_args       TEXT,
     out_table       TEXT,
     damping_factor  FLOAT8,
     max_iter        INTEGER,
     threshold       FLOAT8,
     grouping_cols   VARCHAR
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     vertex_table    TEXT,
     vertex_id       TEXT,
     edge_table      TEXT,
     edge_args       TEXT,
     out_table       TEXT,
     damping_factor  FLOAT8,
     max_iter        INTEGER,
     threshold       FLOAT8
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     vertex_table    TEXT,
     vertex_id       TEXT,
     edge_table      TEXT,
     edge_args       TEXT,
     out_table       TEXT,
     damping_factor  FLOAT8,
     max_iter        INTEGER
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     vertex_table    TEXT,
     vertex_id       TEXT,
     edge_table      TEXT,
     edge_args       TEXT,
     out_table       TEXT,
     damping_factor  FLOAT8
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, NULL, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     vertex_table    TEXT,
     vertex_id       TEXT,
     edge_table      TEXT,
     edge_args       TEXT,
     out_table       TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, NULL, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------

 -- Online help
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
     message VARCHAR
 ) RETURNS VARCHAR AS $$
     PythonFunction(graph, pagerank, pagerank_help)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');

 --------------------------------------------------------------------------------

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank()
 RETURNS VARCHAR AS $$
     SELECT MADLIB_SCHEMA.pagerank('');
 $$ LANGUAGE sql IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
 --------------------------------------------------------------------------------
	/* ----------------------------------------------------------------------- //*
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*
	* @file graph.sql_in
	*
	* @brief SQL functions for graph analytics
	* @date Nov 2016
	*
	* @sa Provides various graph algorithms.
	*
	// ----------------------------------------------------------------------- */
	m4_include(`SQLCommon.m4')


	/**
	@addtogroup grp_pagerank

	<div class="toc"><b>Contents</b>
	<ul>
	<li><a href="#pagerank">PageRank</a></li>
	<li><a href="#examples">Examples</a></li>
	<li><a href="#notes">Notes</a></li>
	<li><a href="#literature">Literature</a></li>
	</ul>
	</div>

	@brief Find the PageRank of all vertices in a directed graph.

	Given a graph, the PageRank algorithm outputs a probability distribution representing the
	likelihood that a person randomly traversing the graph will arrive at any particular vertex.
	This algorithm was originally used by Google to rank websites where the World Wide Web was
	modeled as a directed graph with the vertices representing the websites. The PageRank
	algorithm initially proposed by Larry Page and Sergey Brin is implemented here [1].

	We also implement personalized PageRank, in which a notion of importance
	provides personalization to a query.
	For example, importance scores can be biased according
	to a specified set of vertices in the graph that are of interest or special in some way [2].

	@anchor pagerank
	@par PageRank
	<pre class="syntax">
	pagerank( vertex_table,
	vertex_id,
	edge_table,
	edge_args,
	out_table,
	damping_factor,
	max_iter,
	threshold,
	grouping_cols,
	personalization_vertices
	)
	</pre>

	\b Arguments
	<dl class="arglist">
	<dt>vertex_table</dt>
	<dd>TEXT. Name of the table containing the vertex data for the graph. Must contain the
	column specified in the 'vertex_id' parameter below.</dd>

	<dt>vertex_id</dt>
	<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
	vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates.
	They do not need to be contiguous.</dd>

	<dt>edge_table</dt>
	<dd>TEXT. Name of the table containing the edge data. The edge table must
	contain columns for source vertex and destination vertex.</dd>

	<dt>edge_args</dt>
	<dd>TEXT. A comma-delimited string containing multiple named arguments of
	the form "name=value". The following parameters are supported for
	this string argument:
	- src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table.
	Default column name is 'src'.
	- dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table.
	Default column name is 'dest'.</dd>

	<dt>out_table</dt>
	<dd>TEXT. Name of the table to store the result of PageRank.
	It will contain a row for every vertex from 'vertex_table' with
	the following columns:
	- vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming.
	- pagerank : The vertex's PageRank.
	- grouping_cols : Grouping column (if any) values associated with the vertex_id.</dd>

	A summary table is also created that contains information
	regarding the number of iterations required for convergence.
	It is named by adding the suffix '_summary' to the 'out_table'
	parameter.

	<dt>damping_factor (optional)</dt>
	<dd>FLOAT8, default 0.85. The probability, at any step, that a user will continue following the links in a random surfer model.</dd>

	<dt>max_iter (optional)</dt>
	<dd>INTEGER, default: 100. The maximum number of iterations allowed.</dd>

	<dt>threshold (optional)</dt>
	<dd>FLOAT8, default: (1/number of vertices * 1000). If the difference between the PageRank of every vertex of two consecutive
	iterations is smaller than 'threshold', or the iteration number is larger than 'max_iter', the
	computation stops. If you set the threshold to zero, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'.
	It is advisable to set threshold to a value lower than 1/(number of vertices in the graph) since the PageRank value of nodes is initialized to that
	value.</dd>

	<dt>grouping_cols (optional)</dt>
	<dd>TEXT, default: NULL. A single column or a list of comma-separated
	columns that divides the input data into discrete groups, resulting in one
	distribution per group. When this value is NULL, no grouping is used and
	a single model is generated for all data.
	@note Expressions are not currently supported for 'grouping_cols'.</dd>

	<dt> personalization_vertices (optional)</dt>
	<dd>INTEGER[] or BIGINT[], default: NULL. A comma separated list of vertices or nodes
	for personalized PageRank. When this parameter is provided, personalized PageRank
	will run. In the absence of this parameter, regular PageRank will run.
	</dl>

	@anchor examples
	@examp

	-# Create vertex and edge tables to represent the graph:
	<pre class="syntax">
	DROP TABLE IF EXISTS vertex, edge;
	CREATE TABLE vertex(
	id INTEGER
	);
	CREATE TABLE edge(
	src INTEGER,
	dest INTEGER,
	user_id INTEGER
	);
	INSERT INTO vertex VALUES
	(0),
	(1),
	(2),
	(3),
	(4),
	(5),
	(6);
	INSERT INTO edge VALUES
	(0, 1, 1),
	(0, 2, 1),
	(0, 4, 1),
	(1, 2, 1),
	(1, 3, 1),
	(2, 3, 1),
	(2, 5, 1),
	(2, 6, 1),
	(3, 0, 1),
	(4, 0, 1),
	(5, 6, 1),
	(6, 3, 1),
	(0, 1, 2),
	(0, 2, 2),
	(0, 4, 2),
	(1, 2, 2),
	(1, 3, 2),
	(2, 3, 2),
	(3, 0, 2),
	(4, 0, 2),
	(5, 6, 2),
	(6, 3, 2);
	</pre>

	-# Running PageRank with default values for optional parameters:
	<pre class="syntax">
	DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
	SELECT madlib.pagerank(
	'vertex', -- Vertex table
	'id', -- Vertix id column
	'edge', -- Edge table
	'src=src, dest=dest', -- Comma delimted string of edge arguments
	'pagerank_out'); -- Output table of PageRank
	SELECT * FROM pagerank_out ORDER BY pagerank DESC;
	</pre>
	<pre class="result">
	id \| pagerank
	----+-------------------
	0 \| 0.28753749341184
	3 \| 0.21016988901855
	2 \| 0.14662683454062
	4 \| 0.10289614384217
	1 \| 0.10289614384217
	6 \| 0.09728637768887
	5 \| 0.05258711765692
	(7 rows)
	</pre>
	<pre class="syntax">
	SELECT * FROM pagerank_out_summary;
	</pre>
	<pre class="result">
	__iterations__
	----------------+
	16
	(1 row)
	</pre>

	-# Running PageRank with a damping factor of 0.5 results in different final values:
	<pre class="syntax">
	DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
	SELECT madlib.pagerank(
	'vertex', -- Vertex table
	'id', -- Vertix id column
	'edge', -- Edge table
	'src=src, dest=dest', -- Comma delimted string of edge arguments
	'pagerank_out', -- Output table of PageRank
	0.5); -- Damping factor
	SELECT * FROM pagerank_out ORDER BY pagerank DESC;
	</pre>
	<pre class="result">
	id \| pagerank
	----+--------------------
	0 \| 0.225477161441199
	3 \| 0.199090328586664
	2 \| 0.136261327206477
	6 \| 0.132691559968224
	4 \| 0.109009291409508
	1 \| 0.109009291409508
	5 \| 0.0884610399788161
	(7 rows)
	</pre>

	-# Now compute the PageRank of vertices associated with each user
	using the grouping feature:
	<pre class="syntax">
	DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
	SELECT madlib.pagerank(
	'vertex', -- Vertex table
	'id', -- Vertix id column
	'edge', -- Edge table
	'src=src, dest=dest', -- Comma delimted string of edge arguments
	'pagerank_out', -- Output table of PageRank
	NULL, -- Default damping factor (0.85)
	NULL, -- Default max iters (100)
	0.00000001, -- Threshold
	'user_id'); -- Grouping column name
	SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;
	</pre>
	<pre class="result">
	user_id \| id \| pagerank
	---------+----+--------------------
	1 \| 0 \| 0.27825488388552
	1 \| 3 \| 0.20188114667075
	1 \| 2 \| 0.14288112346059
	1 \| 6 \| 0.11453637832147
	1 \| 1 \| 0.10026745615438
	1 \| 4 \| 0.10026745615438
	1 \| 5 \| 0.06191155535288
	2 \| 0 \| 0.31854625004173
	2 \| 3 \| 0.23786686773343
	2 \| 2 \| 0.15914876489397
	2 \| 1 \| 0.11168334437971
	2 \| 4 \| 0.11168334437971
	2 \| 6 \| 0.03964285714285
	2 \| 5 \| 0.02142857142857
	(14 rows)
	</pre>
	<pre class="syntax">
	SELECT * FROM pagerank_out_summary ORDER BY user_id;
	</pre>
	<pre class="result">
	user_id \| __iterations__
	---------+----------------
	1 \| 27
	2 \| 31
	(2 rows)
	</pre>

	-# Personalized PageRank. Here we specify {2,4}
	as the personalization vertices. This parameter
	could be specified as ARRAY[2,4] as well.
	<pre class="syntax">
	DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;
	SELECT madlib.pagerank(
	'vertex', -- Vertex table
	'id', -- Vertix id column
	'edge', -- Edge table
	'src=src, dest=dest', -- Comma delimted string of edge arguments
	'pagerank_out', -- Output table of PageRank
	NULL, -- Default damping factor (0.85)
	NULL, -- Default max iters (100)
	NULL, -- Default Threshold
	NULL, -- No Grouping
	'{2,4}'); -- Personalization vertices
	SELECT * FROM pagerank_out ORDER BY pagerank DESC;
	</pre>
	<pre class="result">
	id \| pagerank
	----+--------------------
	0 \| 0.565232961966315
	2 \| 0.378139420991773
	3 \| 0.355003292266017
	4 \| 0.310111215897626
	1 \| 0.160111215897626
	6 \| 0.148615315574136
	5 \| 0.0803403307142321
	(7 rows)
	</pre>
	<pre class="syntax">
	SELECT * FROM pagerank_out_summary;
	</pre>
	<pre class="result">
	__iterations__
	----------------+
	37
	(1 row)
	</pre>

	@anchor notes
	@par Notes

	1. On a Greenplum cluster, the edge table should be distributed
	by the source vertex id column for better performance.

	@anchor literature
	@par Literature

	[1] Brin, S. and Page, L. (1998), "The anatomy of a large-scale hypertextual Web search engine",
	Computer Networks and ISDN Systems. 30: 107–117,
	http://infolab.stanford.edu/pub/papers/google.pdf

	[2] Jeh, Glen and Widom, Jennifer. "Scaling Personalized Web Search",
	Proceedings of the 12th international conference on World Wide Web, Pages 271-279
	Budapest, Hungary, May 20-24, 2003,
	http://ilpubs.stanford.edu:8090/530/1/2002-12.pdf
	*/

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
	vertex_table TEXT,
	vertex_id TEXT,
	edge_table TEXT,
	edge_args TEXT,
	out_table TEXT,
	damping_factor FLOAT8,
	max_iter INTEGER,
	threshold FLOAT8,
	grouping_cols VARCHAR,
	personalization_vertices BIGINT[]
	) RETURNS VOID AS $$
	PythonFunction(graph, pagerank, pagerank)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
	-------------------------------------------------------------------------
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
	vertex_table TEXT,
	vertex_id TEXT,
	edge_table TEXT,
	edge_args TEXT,
	out_table TEXT,
	damping_factor FLOAT8,
	max_iter INTEGER,
	threshold FLOAT8,
	grouping_cols VARCHAR
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL)
	$$ LANGUAGE SQL
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
	-------------------------------------------------------------------------
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
	vertex_table TEXT,
	vertex_id TEXT,
	edge_table TEXT,
	edge_args TEXT,
	out_table TEXT,
	damping_factor FLOAT8,
	max_iter INTEGER,
	threshold FLOAT8
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, $8, NULL, NULL)
	$$ LANGUAGE SQL
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
	-------------------------------------------------------------------------
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
	vertex_table TEXT,
	vertex_id TEXT,
	edge_table TEXT,
	edge_args TEXT,
	out_table TEXT,
	damping_factor FLOAT8,
	max_iter INTEGER
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, $7, NULL, NULL, NULL)
	$$ LANGUAGE SQL
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
	-------------------------------------------------------------------------
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
	vertex_table TEXT,
	vertex_id TEXT,
	edge_table TEXT,
	edge_args TEXT,
	out_table TEXT,
	damping_factor FLOAT8
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, $6, 100, NULL, NULL, NULL)
	$$ LANGUAGE SQL
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
	-------------------------------------------------------------------------
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
	vertex_table TEXT,
	vertex_id TEXT,
	edge_table TEXT,
	edge_args TEXT,
	out_table TEXT
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.pagerank($1, $2, $3, $4, $5, 0.85, 100, NULL, NULL, NULL)
	$$ LANGUAGE SQL
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
	-------------------------------------------------------------------------

	-- Online help
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank(
	message VARCHAR
	) RETURNS VARCHAR AS $$
	PythonFunction(graph, pagerank, pagerank_help)
	$$ LANGUAGE plpythonu IMMUTABLE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');

	--------------------------------------------------------------------------------

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pagerank()
	RETURNS VARCHAR AS $$
	SELECT MADLIB_SCHEMA.pagerank('');
	$$ LANGUAGE sql IMMUTABLE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
	--------------------------------------------------------------------------------