blob: d2d6cfc9bc4c740569314c92f4e6c78dfd00ea5f [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file graph.sql_in
*
* @brief SQL functions for graph analytics
* @date Nov 2016
*
* @sa Provides various graph algorithms.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_hits
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#hits">HITS</a></li>
<li><a href="#notes">Notes</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Find the HITS scores(authority and hub) of all vertices in a directed
graph.
Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the
authority score and hub score of every vertex, where authority estimates the
value of the content of the page and hub estimates the value of its links to
other pages. This algorithm was originally developed to rate web pages [1].
@anchor hits
@par HITS
<pre class="syntax">
hits( vertex_table,
vertex_id,
edge_table,
edge_args,
out_table,
max_iter,
threshold,
grouping_cols
)
</pre>
\b Arguments
<dl class="arglist">
<dt>vertex_table</dt>
<dd>TEXT. Name of the table containing the vertex data for the graph. Must
contain the column specified in the 'vertex_id' parameter below.</dd>
<dt>vertex_id</dt>
<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates. They
do not need to be contiguous.</dd>
<dt>edge_table</dt>
<dd>TEXT. Name of the table containing the edge data. The edge table must
contain columns for source vertex and destination vertex.</dd>
<dt>edge_args</dt>
<dd>TEXT. A comma-delimited string containing multiple named arguments of
the form "name=value". The following parameters are supported for
this string argument:
- src (INTEGER or BIGINT): Name of the column containing the source vertex ids in
the edge table. Default column name is 'src'.
- dest (INTEGER or BIGINT): Name of the column containing the destination vertex
ids in the edge table. Default column name is 'dest'.</dd>
<dt>out_table</dt>
<dd>TEXT. Name of the table to store the result of HITS. It will contain
a row for every vertex from 'vertex_table' with the following columns:
- vertex_id : The id of a vertex. Will use the input parameter 'vertex_id'
for column naming.
- authority : The vertex authority score.
- hub : The vertex hub score.
- grouping_cols : Grouping column values (if any) associated with the vertex_id.
</dd>
A summary table is also created that contains information
regarding the number of iterations required for convergence.
It is named by adding the suffix '_summary' to the 'out_table'
parameter.
<dt>max_iter (optional) </dt>
<dd>INTEGER, default: 100. The maximum number of iterations allowed. Each
iteration consists of both authority and hub phases.</dd>
<dt>threshold (optional) </dt>
<dd>FLOAT8, default: (1/number of vertices * 1000).
Threshold must be set to a value between 0 and 1, inclusive
of end points.
If the difference between two consecutive iterations of authority AND two
consecutive iterations of hub is smaller than 'threshold', then the
computation stops. That is, both authority and hub value differences
must be below the specified threshold for the algorithm to stop.
If you set the threshold to 0, then you will force the
algorithm to run for the full number of iterations specified in 'max_iter'.
</dd>
<dt>grouping_cols (optional)</dt>
<dd>TEXT, default: NULL. A single column or a list of comma-separated columns
that divides the input data into discrete groups, resulting in one
distribution per group. When this value is NULL, no grouping is used and a
single model is generated for all data.
@note Expressions are not currently supported for 'grouping_cols'.
</dd>
</dl>
@note On a Greenplum cluster, the edge table should be distributed
by the source vertex id column for better performance.
@anchor notes
@par Notes
This algorithm supports multigraph and each duplicated edge is considered
for counting when calculating authority and hub scores.
@anchor examples
@examp
-# Create vertex and edge tables to represent the graph:
<pre class="syntax">
DROP TABLE IF EXISTS vertex, edge;
CREATE TABLE vertex(
id INTEGER
);
CREATE TABLE edge(
src INTEGER,
dest INTEGER,
user_id INTEGER
);
INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6);
INSERT INTO edge VALUES
(0, 1, 1),
(0, 2, 1),
(0, 4, 1),
(1, 2, 1),
(1, 3, 1),
(2, 3, 1),
(2, 5, 1),
(2, 6, 1),
(3, 0, 1),
(4, 0, 1),
(5, 6, 1),
(6, 3, 1);
</pre>
-# Running HITS with default values for optional parameters:
<pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out'); -- Output table of HITS
SELECT * FROM hits_out ORDER BY id;
</pre>
<pre class="result">
id | authority | hub
----+----------------------+----------------------
0 | 8.43871829093e-07 | 0.338306115082665
1 | 0.158459587238244 | 0.527865350448059
2 | 0.405627969689677 | 0.675800764727558
3 | 0.721775835521825 | 3.95111934817e-07
4 | 0.158459587238244 | 3.95111934817e-07
5 | 0.316385413093048 | 0.189719957843216
6 | 0.405199928761102 | 0.337944978189241
(7 rows)
</pre>
<pre class="syntax">
SELECT * FROM hits_out_summary;
</pre>
<pre class="result">
__iterations__
-----------------+
17
(1 row)
</pre>
-# Running HITS with max_iter of 3 results in different authority and hub
scores:
<pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
3); -- Max iteration
SELECT * FROM hits_out ORDER BY id;
</pre>
<pre class="result">
id | authority | hub
----+-------------------+--------------------
0 | 0.08653327387778 | 0.375721659592363
1 | 0.18388320699029 | 0.533118571043218
2 | 0.43266636938891 | 0.654974244424525
3 | 0.70308285025699 | 0.040618557793769
4 | 0.18388320699029 | 0.040618557793769
5 | 0.30286645857224 | 0.182783510071961
6 | 0.38939973245002 | 0.330025782074373
(7 rows)
</pre>
<pre class="syntax">
SELECT * FROM hits_out_summary;
</pre>
<pre class="result">
__iterations__
-----------------+
3
(1 row)
</pre>
-# Running HITS with a low threshold of 0.00001 results in more iterations for convergence:
<pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
NULL, -- Default max_iter
0.00001); -- Threshold
SELECT * FROM hits_out ORDER BY id;
</pre>
<pre class="result">
id | authority | hub
----+----------------------+---------------------
0 | 1.15243075426e-09 | 0.33800946769422
1 | 0.158264459912827 | 0.527792117750177
2 | 0.405384672299625 | 0.675965453766535
3 | 0.72186275724613 | 5.39583282614e-10
4 | 0.158264459912827 | 5.39583282614e-10
5 | 0.316493740997913 | 0.189793242747412
6 | 0.405356461070609 | 0.337985666133163
(7 rows)
</pre>
<pre class="syntax">
SELECT * FROM hits_out_summary;
</pre>
<pre class="result">
__iterations__
-----------------+
25
(1 row)
</pre>
-# Running HITS with both max_iter and threshold:
<pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
20, -- Default max_iter
0.00001); -- Threshold
SELECT * FROM hits_out ORDER BY id;
</pre>
<pre class="result">
id | authority | hub
----+----------------------+---------------------
0 | 7.11260011825e-08 | 0.33810307986005
1 | 0.158326035587958 | 0.527815233930963
2 | 0.405461453180491 | 0.675913495026452
3 | 0.721835343230399 | 3.33021322089e-08
4 | 0.158326035587958 | 3.33021322089e-08
5 | 0.316459563893809 | 0.189770119973925
6 | 0.405307074424261 | 0.337972831786458
(7 rows)
</pre>
<pre class="syntax">
SELECT * FROM hits_out_summary;
</pre>
<pre class="result">
__iterations__
-----------------+
20
(1 row)
</pre>
The algorithm stopped at 20 iterations even though the convergence for threshold
of 0.00001 is at 25 iterations. This is because max_iter was set to 20.
-# Running HITS with grouping column and default values for max_iter and threshold.
Add more rows to the edge table to create different graphs based on the user_id column.
<pre class="syntax">
INSERT INTO edge VALUES
(0, 1, 2),
(0, 2, 2),
(0, 4, 2),
(1, 2, 2),
(1, 3, 2),
(2, 3, 2),
(3, 0, 2),
(4, 0, 2),
(5, 6, 2),
(6, 3, 2);
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
NULL, -- Default max_iter
NULL, -- Threshold
'user_id'); -- Grouping column
SELECT * FROM hits_out ORDER BY user_id, id;
</pre>
<pre class="result">
user_id | id | authority | hub
---------+----+----------------------+----------------------
1 | 0 | 8.43871829093e-07 | 0.338306115082665
1 | 1 | 0.158459587238244 | 0.527865350448059
1 | 2 | 0.405627969689677 | 0.675800764727558
1 | 3 | 0.721775835521825 | 3.95111934817e-07
1 | 4 | 0.158459587238244 | 3.95111934817e-07
1 | 5 | 0.316385413093048 | 0.189719957843216
1 | 6 | 0.405199928761102 | 0.337944978189241
2 | 0 | 1.60841750444e-05 | 0.632262085114062
2 | 1 | 0.316079985713431 | 0.632529390899584
2 | 2 | 0.632364174872359 | 0.316347297480213
2 | 3 | 0.632694582987791 | 8.04208767442e-06
2 | 4 | 0.316079985713431 | 8.04208767442e-06
2 | 5 | 0 | 1.22712519446e-10
2 | 6 | 2.45425034248e-10 | 0.316347297480213
(14 rows)
</pre>
<pre class="syntax">
SELECT * FROM hits_out_summary order by user_id;
</pre>
<pre class="result">
user_id | __iterations__
---------+----------------
1 | 17
2 | 16
(2 rows)
</pre>
@anchor literature
@par Literature
[1] Kleinerg, Jon M., "Authoritative Sources in a Hyperlinked
Environment", Journal of the ACM, Sept. 1999.
https://www.cs.cornell.edu/home/kleinber/auth.pdf
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
max_iter INTEGER,
threshold FLOAT8,
grouping_cols VARCHAR
) RETURNS VOID AS $$
PythonFunction(graph, hits, hits)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
max_iter INTEGER,
threshold FLOAT8
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, $6, $7, NULL )
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT,
max_iter INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, $6, NULL, NULL )
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
vertex_table TEXT,
vertex_id TEXT,
edge_table TEXT,
edge_args TEXT,
out_table TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, 100, NULL, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-------------------------------------------------------------------------
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(graph, hits, hits_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.hits('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
-------------------------------------------------------------------------