| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file graph.sql_in |
| * |
| * @brief SQL functions for graph analytics |
| * @date Nov 2016 |
| * |
| * @sa Provides various graph algorithms. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_hits |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#hits">HITS</a></li> |
| <li><a href="#notes">Notes</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Find the HITS scores(authority and hub) of all vertices in a directed |
| graph. |
| |
| Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the |
| authority score and hub score of every vertex, where authority estimates the |
| value of the content of the page and hub estimates the value of its links to |
| other pages. This algorithm was originally developed to rate web pages [1]. |
| |
| @anchor hits |
| @par HITS |
| <pre class="syntax"> |
| hits( vertex_table, |
| vertex_id, |
| edge_table, |
| edge_args, |
| out_table, |
| max_iter, |
| threshold, |
| grouping_cols |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>vertex_table</dt> |
| <dd>TEXT. Name of the table containing the vertex data for the graph. Must |
| contain the column specified in the 'vertex_id' parameter below.</dd> |
| |
| <dt>vertex_id</dt> |
| <dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing |
| vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates. They |
| do not need to be contiguous.</dd> |
| |
| <dt>edge_table</dt> |
| <dd>TEXT. Name of the table containing the edge data. The edge table must |
| contain columns for source vertex and destination vertex.</dd> |
| |
| <dt>edge_args</dt> |
| <dd>TEXT. A comma-delimited string containing multiple named arguments of |
| the form "name=value". The following parameters are supported for |
| this string argument: |
| - src (INTEGER or BIGINT): Name of the column containing the source vertex ids in |
| the edge table. Default column name is 'src'. |
| - dest (INTEGER or BIGINT): Name of the column containing the destination vertex |
| ids in the edge table. Default column name is 'dest'.</dd> |
| |
| <dt>out_table</dt> |
| <dd>TEXT. Name of the table to store the result of HITS. It will contain |
| a row for every vertex from 'vertex_table' with the following columns: |
| - vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' |
| for column naming. |
| - authority : The vertex authority score. |
| - hub : The vertex hub score. |
| - grouping_cols : Grouping column values (if any) associated with the vertex_id. |
| </dd> |
| |
| A summary table is also created that contains information |
| regarding the number of iterations required for convergence. |
| It is named by adding the suffix '_summary' to the 'out_table' |
| parameter. |
| |
| <dt>max_iter (optional) </dt> |
| <dd>INTEGER, default: 100. The maximum number of iterations allowed. Each |
| iteration consists of both authority and hub phases.</dd> |
| |
| <dt>threshold (optional) </dt> |
| <dd>FLOAT8, default: (1/number of vertices * 1000). |
| Threshold must be set to a value between 0 and 1, inclusive |
| of end points. |
| If the difference between two consecutive iterations of authority AND two |
| consecutive iterations of hub is smaller than 'threshold', then the |
| computation stops. That is, both authority and hub value differences |
| must be below the specified threshold for the algorithm to stop. |
| If you set the threshold to 0, then you will force the |
| algorithm to run for the full number of iterations specified in 'max_iter'. |
| </dd> |
| |
| <dt>grouping_cols (optional)</dt> |
| <dd>TEXT, default: NULL. A single column or a list of comma-separated columns |
| that divides the input data into discrete groups, resulting in one |
| distribution per group. When this value is NULL, no grouping is used and a |
| single model is generated for all data. |
| @note Expressions are not currently supported for 'grouping_cols'. |
| </dd> |
| |
| </dl> |
| |
| @note On a Greenplum cluster, the edge table should be distributed |
| by the source vertex id column for better performance. |
| |
| @anchor notes |
| @par Notes |
| |
| This algorithm supports multigraph and each duplicated edge is considered |
| for counting when calculating authority and hub scores. |
| |
| @anchor examples |
| @examp |
| |
| -# Create vertex and edge tables to represent the graph: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS vertex, edge; |
| CREATE TABLE vertex( |
| id INTEGER |
| ); |
| CREATE TABLE edge( |
| src INTEGER, |
| dest INTEGER, |
| user_id INTEGER |
| ); |
| INSERT INTO vertex VALUES |
| (0), |
| (1), |
| (2), |
| (3), |
| (4), |
| (5), |
| (6); |
| INSERT INTO edge VALUES |
| (0, 1, 1), |
| (0, 2, 1), |
| (0, 4, 1), |
| (1, 2, 1), |
| (1, 3, 1), |
| (2, 3, 1), |
| (2, 5, 1), |
| (2, 6, 1), |
| (3, 0, 1), |
| (4, 0, 1), |
| (5, 6, 1), |
| (6, 3, 1); |
| </pre> |
| |
| -# Running HITS with default values for optional parameters: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS hits_out, hits_out_summary; |
| SELECT madlib.hits( |
| 'vertex', -- Vertex table |
| 'id', -- Vertex id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimited string of edge arguments |
| 'hits_out'); -- Output table of HITS |
| SELECT * FROM hits_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | authority | hub |
| ----+----------------------+---------------------- |
| 0 | 8.43871829093e-07 | 0.338306115082665 |
| 1 | 0.158459587238244 | 0.527865350448059 |
| 2 | 0.405627969689677 | 0.675800764727558 |
| 3 | 0.721775835521825 | 3.95111934817e-07 |
| 4 | 0.158459587238244 | 3.95111934817e-07 |
| 5 | 0.316385413093048 | 0.189719957843216 |
| 6 | 0.405199928761102 | 0.337944978189241 |
| (7 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM hits_out_summary; |
| </pre> |
| <pre class="result"> |
| __iterations__ |
| -----------------+ |
| 17 |
| (1 row) |
| </pre> |
| -# Running HITS with max_iter of 3 results in different authority and hub |
| scores: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS hits_out, hits_out_summary; |
| SELECT madlib.hits( |
| 'vertex', -- Vertex table |
| 'id', -- Vertex id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimited string of edge arguments |
| 'hits_out', -- Output table |
| 3); -- Max iteration |
| SELECT * FROM hits_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | authority | hub |
| ----+-------------------+-------------------- |
| 0 | 0.08653327387778 | 0.375721659592363 |
| 1 | 0.18388320699029 | 0.533118571043218 |
| 2 | 0.43266636938891 | 0.654974244424525 |
| 3 | 0.70308285025699 | 0.040618557793769 |
| 4 | 0.18388320699029 | 0.040618557793769 |
| 5 | 0.30286645857224 | 0.182783510071961 |
| 6 | 0.38939973245002 | 0.330025782074373 |
| (7 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM hits_out_summary; |
| </pre> |
| <pre class="result"> |
| __iterations__ |
| -----------------+ |
| 3 |
| (1 row) |
| </pre> |
| |
| -# Running HITS with a low threshold of 0.00001 results in more iterations for convergence: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS hits_out, hits_out_summary; |
| SELECT madlib.hits( |
| 'vertex', -- Vertex table |
| 'id', -- Vertex id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimited string of edge arguments |
| 'hits_out', -- Output table |
| NULL, -- Default max_iter |
| 0.00001); -- Threshold |
| SELECT * FROM hits_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | authority | hub |
| ----+----------------------+--------------------- |
| 0 | 1.15243075426e-09 | 0.33800946769422 |
| 1 | 0.158264459912827 | 0.527792117750177 |
| 2 | 0.405384672299625 | 0.675965453766535 |
| 3 | 0.72186275724613 | 5.39583282614e-10 |
| 4 | 0.158264459912827 | 5.39583282614e-10 |
| 5 | 0.316493740997913 | 0.189793242747412 |
| 6 | 0.405356461070609 | 0.337985666133163 |
| (7 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM hits_out_summary; |
| </pre> |
| <pre class="result"> |
| __iterations__ |
| -----------------+ |
| 25 |
| (1 row) |
| </pre> |
| |
| |
| -# Running HITS with both max_iter and threshold: |
| <pre class="syntax"> |
| DROP TABLE IF EXISTS hits_out, hits_out_summary; |
| SELECT madlib.hits( |
| 'vertex', -- Vertex table |
| 'id', -- Vertex id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimited string of edge arguments |
| 'hits_out', -- Output table |
| 20, -- Default max_iter |
| 0.00001); -- Threshold |
| SELECT * FROM hits_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | authority | hub |
| ----+----------------------+--------------------- |
| 0 | 7.11260011825e-08 | 0.33810307986005 |
| 1 | 0.158326035587958 | 0.527815233930963 |
| 2 | 0.405461453180491 | 0.675913495026452 |
| 3 | 0.721835343230399 | 3.33021322089e-08 |
| 4 | 0.158326035587958 | 3.33021322089e-08 |
| 5 | 0.316459563893809 | 0.189770119973925 |
| 6 | 0.405307074424261 | 0.337972831786458 |
| (7 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM hits_out_summary; |
| </pre> |
| <pre class="result"> |
| __iterations__ |
| -----------------+ |
| 20 |
| (1 row) |
| </pre> |
| The algorithm stopped at 20 iterations even though the convergence for threshold |
| of 0.00001 is at 25 iterations. This is because max_iter was set to 20. |
| |
| -# Running HITS with grouping column and default values for max_iter and threshold. |
| Add more rows to the edge table to create different graphs based on the user_id column. |
| <pre class="syntax"> |
| INSERT INTO edge VALUES |
| (0, 1, 2), |
| (0, 2, 2), |
| (0, 4, 2), |
| (1, 2, 2), |
| (1, 3, 2), |
| (2, 3, 2), |
| (3, 0, 2), |
| (4, 0, 2), |
| (5, 6, 2), |
| (6, 3, 2); |
| DROP TABLE IF EXISTS hits_out, hits_out_summary; |
| SELECT madlib.hits( |
| 'vertex', -- Vertex table |
| 'id', -- Vertex id column |
| 'edge', -- Edge table |
| 'src=src, dest=dest', -- Comma delimited string of edge arguments |
| 'hits_out', -- Output table |
| NULL, -- Default max_iter |
| NULL, -- Threshold |
| 'user_id'); -- Grouping column |
| SELECT * FROM hits_out ORDER BY user_id, id; |
| </pre> |
| <pre class="result"> |
| user_id | id | authority | hub |
| ---------+----+----------------------+---------------------- |
| 1 | 0 | 8.43871829093e-07 | 0.338306115082665 |
| 1 | 1 | 0.158459587238244 | 0.527865350448059 |
| 1 | 2 | 0.405627969689677 | 0.675800764727558 |
| 1 | 3 | 0.721775835521825 | 3.95111934817e-07 |
| 1 | 4 | 0.158459587238244 | 3.95111934817e-07 |
| 1 | 5 | 0.316385413093048 | 0.189719957843216 |
| 1 | 6 | 0.405199928761102 | 0.337944978189241 |
| 2 | 0 | 1.60841750444e-05 | 0.632262085114062 |
| 2 | 1 | 0.316079985713431 | 0.632529390899584 |
| 2 | 2 | 0.632364174872359 | 0.316347297480213 |
| 2 | 3 | 0.632694582987791 | 8.04208767442e-06 |
| 2 | 4 | 0.316079985713431 | 8.04208767442e-06 |
| 2 | 5 | 0 | 1.22712519446e-10 |
| 2 | 6 | 2.45425034248e-10 | 0.316347297480213 |
| (14 rows) |
| </pre> |
| <pre class="syntax"> |
| SELECT * FROM hits_out_summary order by user_id; |
| </pre> |
| <pre class="result"> |
| user_id | __iterations__ |
| ---------+---------------- |
| 1 | 17 |
| 2 | 16 |
| (2 rows) |
| </pre> |
| |
| @anchor literature |
| @par Literature |
| |
| [1] Kleinerg, Jon M., "Authoritative Sources in a Hyperlinked |
| Environment", Journal of the ACM, Sept. 1999. |
| https://www.cs.cornell.edu/home/kleinber/auth.pdf |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| max_iter INTEGER, |
| threshold FLOAT8, |
| grouping_cols VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(graph, hits, hits) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| max_iter INTEGER, |
| threshold FLOAT8 |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, $6, $7, NULL ) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT, |
| max_iter INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, $6, NULL, NULL ) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits( |
| vertex_table TEXT, |
| vertex_id TEXT, |
| edge_table TEXT, |
| edge_args TEXT, |
| out_table TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.hits($1, $2, $3, $4, $5, 100, NULL, NULL) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| ------------------------------------------------------------------------- |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(graph, hits, hits_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| ------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.hits() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.hits(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| ------------------------------------------------------------------------- |