blob: 0441e916b65e03cf056466ffe67fe604dd3b5b60 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: HITS</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.21.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__hits.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">HITS<div class="ingroups"><a class="el" href="group__grp__graph.html">Graph</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li>
<a href="#hits">HITS</a> </li>
<li>
<a href="#examples">Examples</a> </li>
<li>
<a href="#notes">Notes</a> </li>
<li>
<a href="#literature">Literature</a> </li>
</ul>
</div><p>Given a graph, the HITS (Hyperlink-Induced Topic Search) algorithm outputs the authority score and hub score of every vertex, where authority estimates the value of the content of the page and hub estimates the value of its links to other pages. This algorithm was originally developed to rate web pages [1].</p>
<p><a class="anchor" id="hits"></a></p><dl class="section user"><dt>HITS</dt><dd><pre class="syntax">
hits( vertex_table,
vertex_id,
edge_table,
edge_args,
out_table,
max_iter,
threshold,
grouping_cols
)
</pre></dd></dl>
<p><b>Arguments</b> </p><dl class="arglist">
<dt>vertex_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing the vertex data for the graph. Must contain the column specified in the 'vertex_id' parameter below.</p>
<p class="enddd"></p>
</dd>
<dt>vertex_id </dt>
<dd><p class="startdd">TEXT, default = 'id'. Name of the column in 'vertex_table' containing vertex ids. The vertex ids can be of type INTEGER or BIGINT with no duplicates. They do not need to be contiguous.</p>
<p class="enddd"></p>
</dd>
<dt>edge_table </dt>
<dd><p class="startdd">TEXT. Name of the table containing the edge data. The edge table must contain columns for source vertex and destination vertex.</p>
<p class="enddd"></p>
</dd>
<dt>edge_args </dt>
<dd><p class="startdd">TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument:</p><ul>
<li>src (INTEGER or BIGINT): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'.</li>
<li>dest (INTEGER or BIGINT): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.</li>
</ul>
<p class="enddd"></p>
</dd>
<dt>out_table </dt>
<dd><p class="startdd">TEXT. Name of the table to store the result of HITS. It will contain a row for every vertex from 'vertex_table' with the following columns:</p><ul>
<li>vertex_id : The id of a vertex. Will use the input parameter 'vertex_id' for column naming.</li>
<li>authority : The vertex authority score.</li>
<li>hub : The vertex hub score.</li>
<li>grouping_cols : Grouping column values (if any) associated with the vertex_id. </li>
</ul>
<p>A summary table is also created that contains information regarding the number of iterations required for convergence. It is named by adding the suffix '_summary' to the 'out_table' parameter.</p>
<p class="enddd"></p>
</dd>
<dt>max_iter (optional) </dt>
<dd><p class="startdd">INTEGER, default: 100. The maximum number of iterations allowed. Each iteration consists of both authority and hub phases.</p>
<p class="enddd"></p>
</dd>
<dt>threshold (optional) </dt>
<dd><p class="startdd">FLOAT8, default: (1/number of vertices * 1000). Threshold must be set to a value between 0 and 1, inclusive of end points. If the difference between two consecutive iterations of authority AND two consecutive iterations of hub is smaller than 'threshold', then the computation stops. That is, both authority and hub value differences must be below the specified threshold for the algorithm to stop. If you set the threshold to 0, then you will force the algorithm to run for the full number of iterations specified in 'max_iter'. </p>
<p class="enddd"></p>
</dd>
<dt>grouping_cols (optional) </dt>
<dd>TEXT, default: NULL. A single column or a list of comma-separated columns that divides the input data into discrete groups, resulting in one distribution per group. When this value is NULL, no grouping is used and a single model is generated for all data. <dl class="section note"><dt>Note</dt><dd>Expressions are not currently supported for 'grouping_cols'. </dd></dl>
</dd>
</dl>
<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
<ol type="1">
<li>Create vertex and edge tables to represent the graph: <pre class="syntax">
DROP TABLE IF EXISTS vertex, edge;
CREATE TABLE vertex(
id INTEGER
);
CREATE TABLE edge(
src INTEGER,
dest INTEGER,
user_id INTEGER
);
INSERT INTO vertex VALUES
(0),
(1),
(2),
(3),
(4),
(5),
(6);
INSERT INTO edge VALUES
(0, 1, 1),
(0, 2, 1),
(0, 4, 1),
(1, 2, 1),
(1, 3, 1),
(2, 3, 1),
(2, 5, 1),
(2, 6, 1),
(3, 0, 1),
(4, 0, 1),
(5, 6, 1),
(6, 3, 1);
</pre></li>
<li>Running HITS with default values for optional parameters: <pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out'); -- Output table of HITS
SELECT * FROM hits_out ORDER BY id;
</pre> <pre class="result">
id | authority | hub
----+----------------------+----------------------
0 | 8.43871829093e-07 | 0.338306115082665
1 | 0.158459587238244 | 0.527865350448059
2 | 0.405627969689677 | 0.675800764727558
3 | 0.721775835521825 | 3.95111934817e-07
4 | 0.158459587238244 | 3.95111934817e-07
5 | 0.316385413093048 | 0.189719957843216
6 | 0.405199928761102 | 0.337944978189241
(7 rows)
</pre> <pre class="syntax">
SELECT * FROM hits_out_summary;
</pre> <pre class="result">
__iterations__
-----------------+
17
(1 row)
</pre></li>
<li>Running HITS with max_iter of 3 results in different authority and hub scores: <pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
3); -- Max iteration
SELECT * FROM hits_out ORDER BY id;
</pre> <pre class="result">
id | authority | hub
----+-------------------+--------------------
0 | 0.08653327387778 | 0.375721659592363
1 | 0.18388320699029 | 0.533118571043218
2 | 0.43266636938891 | 0.654974244424525
3 | 0.70308285025699 | 0.040618557793769
4 | 0.18388320699029 | 0.040618557793769
5 | 0.30286645857224 | 0.182783510071961
6 | 0.38939973245002 | 0.330025782074373
(7 rows)
</pre> <pre class="syntax">
SELECT * FROM hits_out_summary;
</pre> <pre class="result">
__iterations__
-----------------+
3
(1 row)
</pre></li>
<li>Running HITS with a low threshold of 0.00001 results in more iterations for convergence: <pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
NULL, -- Default max_iter
0.00001); -- Threshold
SELECT * FROM hits_out ORDER BY id;
</pre> <pre class="result">
id | authority | hub
----+----------------------+---------------------
0 | 1.15243075426e-09 | 0.33800946769422
1 | 0.158264459912827 | 0.527792117750177
2 | 0.405384672299625 | 0.675965453766535
3 | 0.72186275724613 | 5.39583282614e-10
4 | 0.158264459912827 | 5.39583282614e-10
5 | 0.316493740997913 | 0.189793242747412
6 | 0.405356461070609 | 0.337985666133163
(7 rows)
</pre> <pre class="syntax">
SELECT * FROM hits_out_summary;
</pre> <pre class="result">
__iterations__
-----------------+
25
(1 row)
</pre></li>
<li>Running HITS with both max_iter and threshold: <pre class="syntax">
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
20, -- Default max_iter
0.00001); -- Threshold
SELECT * FROM hits_out ORDER BY id;
</pre> <pre class="result">
id | authority | hub
----+----------------------+---------------------
0 | 7.11260011825e-08 | 0.33810307986005
1 | 0.158326035587958 | 0.527815233930963
2 | 0.405461453180491 | 0.675913495026452
3 | 0.721835343230399 | 3.33021322089e-08
4 | 0.158326035587958 | 3.33021322089e-08
5 | 0.316459563893809 | 0.189770119973925
6 | 0.405307074424261 | 0.337972831786458
(7 rows)
</pre> <pre class="syntax">
SELECT * FROM hits_out_summary;
</pre> <pre class="result">
__iterations__
-----------------+
20
(1 row)
</pre> The algorithm stopped at 20 iterations even though the convergence for threshold of 0.00001 is at 25 iterations. This is because max_iter was set to 20.</li>
<li>Running HITS with grouping column and default values for max_iter and threshold. Add more rows to the edge table to create different graphs based on the user_id column. <pre class="syntax">
INSERT INTO edge VALUES
(0, 1, 2),
(0, 2, 2),
(0, 4, 2),
(1, 2, 2),
(1, 3, 2),
(2, 3, 2),
(3, 0, 2),
(4, 0, 2),
(5, 6, 2),
(6, 3, 2);
DROP TABLE IF EXISTS hits_out, hits_out_summary;
SELECT madlib.hits(
'vertex', -- Vertex table
'id', -- Vertex id column
'edge', -- Edge table
'src=src, dest=dest', -- Comma delimited string of edge arguments
'hits_out', -- Output table
NULL, -- Default max_iter
NULL, -- Threshold
'user_id'); -- Grouping column
SELECT * FROM hits_out ORDER BY user_id, id;
</pre> <pre class="result">
user_id | id | authority | hub
---------+----+----------------------+----------------------
1 | 0 | 8.43871829093e-07 | 0.338306115082665
1 | 1 | 0.158459587238244 | 0.527865350448059
1 | 2 | 0.405627969689677 | 0.675800764727558
1 | 3 | 0.721775835521825 | 3.95111934817e-07
1 | 4 | 0.158459587238244 | 3.95111934817e-07
1 | 5 | 0.316385413093048 | 0.189719957843216
1 | 6 | 0.405199928761102 | 0.337944978189241
2 | 0 | 1.60841750444e-05 | 0.632262085114062
2 | 1 | 0.316079985713431 | 0.632529390899584
2 | 2 | 0.632364174872359 | 0.316347297480213
2 | 3 | 0.632694582987791 | 8.04208767442e-06
2 | 4 | 0.316079985713431 | 8.04208767442e-06
2 | 5 | 0 | 1.22712519446e-10
2 | 6 | 2.45425034248e-10 | 0.316347297480213
(14 rows)
</pre> <pre class="syntax">
SELECT * FROM hits_out_summary order by user_id;
</pre> <pre class="result">
user_id | __iterations__
---------+----------------
1 | 17
2 | 16
(2 rows)
</pre></li>
</ol>
<p><a class="anchor" id="notes"></a></p><dl class="section user"><dt>Notes</dt><dd></dd></dl>
<ol type="1">
<li>On a Greenplum cluster, the edge table should be distributed by the source vertex id column for better performance.</li>
<li>This implementation of the HITS algorithm supports multigraph and each duplicated edge is considered for counting when calculating authority and hub scores.</li>
</ol>
<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>[1] Kleinerg, Jon M., "Authoritative Sources in a Hyperlinked
Environment", Journal of the ACM, Sept. 1999. <a href="https://www.cs.cornell.edu/home/kleinber/auth.pdf">https://www.cs.cornell.edu/home/kleinber/auth.pdf</a> </p>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Feb 23 2023 19:26:40 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>