blob: 40fef57d3c1d518eeceaa06f5c2b7ee55c8edc4f [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# HITS (Hyperlink-Induced Topic Search) \n",
"Outputs the authority score and hub score of every vertex, where authority estimates the value of the content of the page and hub estimates the value of its links to other pages. Added in MADlib 1.13."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
" \"You should import from traitlets.config instead.\", ShimWarning)\n",
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
" warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpdbchina@madlib'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Greenplum 4.3.10.0\n",
"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.13-dev, git revision: rel/v1.12-31-g4b7d9cc, cmake configuration time: Tue Nov 21 22:31:28 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-31-g4b7d9cc, cmake configuration time: Tue Nov 21 22:31:28 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Create vertex and edge tables"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"Done.\n",
"7 rows affected.\n",
"12 rows affected.\n",
"12 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>src</th>\n",
" <th>dest</th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 1, 1),\n",
" (0, 2, 1),\n",
" (0, 4, 1),\n",
" (1, 2, 1),\n",
" (1, 3, 1),\n",
" (2, 3, 1),\n",
" (2, 5, 1),\n",
" (2, 6, 1),\n",
" (3, 0, 1),\n",
" (4, 0, 1),\n",
" (5, 6, 1),\n",
" (6, 3, 1)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS vertex, edge;\n",
"\n",
"CREATE TABLE vertex(\n",
" id INTEGER\n",
" );\n",
"\n",
"CREATE TABLE edge(\n",
" src INTEGER,\n",
" dest INTEGER,\n",
" user_id INTEGER\n",
" );\n",
"\n",
"INSERT INTO vertex VALUES\n",
"(0),\n",
"(1),\n",
"(2),\n",
"(3),\n",
"(4),\n",
"(5),\n",
"(6);\n",
"\n",
"INSERT INTO edge VALUES\n",
"(0, 1, 1),\n",
"(0, 2, 1),\n",
"(0, 4, 1),\n",
"(1, 2, 1),\n",
"(1, 3, 1),\n",
"(2, 3, 1),\n",
"(2, 5, 1),\n",
"(2, 6, 1),\n",
"(3, 0, 1),\n",
"(4, 0, 1),\n",
"(5, 6, 1),\n",
"(6, 3, 1);\n",
"\n",
"SELECT * from edge ORDER BY src, dest;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Compute the HITS scores"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"7 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>authority</th>\n",
" <th>hub</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>8.43871829093e-07</td>\n",
" <td>0.338306115083</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.158459587238</td>\n",
" <td>0.527865350448</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.40562796969</td>\n",
" <td>0.675800764728</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.721775835522</td>\n",
" <td>3.95111934817e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.158459587238</td>\n",
" <td>3.95111934817e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.316385413093</td>\n",
" <td>0.189719957843</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.405199928761</td>\n",
" <td>0.337944978189</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 8.4387182909347e-07, 0.338306115082665),\n",
" (1, 0.158459587238244, 0.527865350448059),\n",
" (2, 0.405627969689677, 0.675800764727558),\n",
" (3, 0.721775835521825, 3.95111934817447e-07),\n",
" (4, 0.158459587238244, 3.95111934817447e-07),\n",
" (5, 0.316385413093048, 0.189719957843216),\n",
" (6, 0.405199928761102, 0.337944978189241)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
"\n",
"SELECT madlib.hits(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertex id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
" 'hits_out'); -- Output table of HITS\n",
"\n",
"SELECT * FROM hits_out ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look at the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(17,)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM hits_out_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Different number of iterations\n",
"Results in different hub and authority scores."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"7 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>authority</th>\n",
" <th>hub</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.0865332738778</td>\n",
" <td>0.375721659592</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.18388320699</td>\n",
" <td>0.533118571043</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.432666369389</td>\n",
" <td>0.654974244425</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.703082850257</td>\n",
" <td>0.0406185577938</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.18388320699</td>\n",
" <td>0.0406185577938</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.302866458572</td>\n",
" <td>0.182783510072</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.38939973245</td>\n",
" <td>0.330025782074</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0.0865332738777835, 0.375721659592363),\n",
" (1, 0.18388320699029, 0.533118571043218),\n",
" (2, 0.432666369388918, 0.654974244424525),\n",
" (3, 0.703082850256991, 0.040618557793769),\n",
" (4, 0.18388320699029, 0.040618557793769),\n",
" (5, 0.302866458572242, 0.182783510071961),\n",
" (6, 0.389399732450026, 0.330025782074373)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
"\n",
"SELECT madlib.hits(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertex id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
" 'hits_out', -- Output table of HITS\n",
" 3); -- Max iteration\n",
"\n",
"SELECT * FROM hits_out ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look at the summary table: "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(3,)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM hits_out_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Different theshold\n",
"Running HITS with a low threshold of 0.00001 results in more iterations for convergence"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"7 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>authority</th>\n",
" <th>hub</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.194028500029</td>\n",
" <td>0.39062401003</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.194028500029</td>\n",
" <td>0.528491307688</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.436564125065</td>\n",
" <td>0.643380722403</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.679099750102</td>\n",
" <td>0.0919115317719</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.194028500029</td>\n",
" <td>0.0919115317719</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.291042750044</td>\n",
" <td>0.183823063544</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.388057000058</td>\n",
" <td>0.321690361202</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0.194028500029066, 0.390624010030425),\n",
" (1, 0.194028500029066, 0.528491307688221),\n",
" (2, 0.436564125065399, 0.643380722403052),\n",
" (3, 0.679099750101732, 0.0919115317718646),\n",
" (4, 0.194028500029066, 0.0919115317718646),\n",
" (5, 0.2910427500436, 0.183823063543729),\n",
" (6, 0.388057000058133, 0.321690361201526)]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
"\n",
"SELECT madlib.hits(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertex id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
" 'hits_out', -- Output table of HITS\n",
" NULL, -- Default max_iter\n",
" 0.5); -- Threshold\n",
"\n",
"SELECT * FROM hits_out ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look at the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(2,)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM hits_out_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5. Different number of iterations and threshold"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"7 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>authority</th>\n",
" <th>hub</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>7.11260011826e-08</td>\n",
" <td>0.33810307986</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.158326035588</td>\n",
" <td>0.527815233931</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.40546145318</td>\n",
" <td>0.675913495026</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.72183534323</td>\n",
" <td>3.33021322089e-08</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.158326035588</td>\n",
" <td>3.33021322089e-08</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.316459563894</td>\n",
" <td>0.189770119974</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.405307074424</td>\n",
" <td>0.337972831786</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 7.11260011825861e-08, 0.33810307986005),\n",
" (1, 0.158326035587958, 0.527815233930963),\n",
" (2, 0.405461453180491, 0.675913495026452),\n",
" (3, 0.721835343230399, 3.3302132208914e-08),\n",
" (4, 0.158326035587958, 3.3302132208914e-08),\n",
" (5, 0.316459563893809, 0.189770119973925),\n",
" (6, 0.405307074424261, 0.337972831786458)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
"\n",
"SELECT madlib.hits(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertex id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
" 'hits_out', -- Output table\n",
" 20, -- Default max_iter\n",
" 0.00001); -- Threshold\n",
"\n",
"SELECT * FROM hits_out ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look at the summary table. The algorithm stopped at 20 iterations even though the convergence for threshold of 0.00001 is at 25 iterations. This is because max_iter was set to 20."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(20,)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM hits_out_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Grouping\n",
"Running HITS with grouping column and default values for max_iter and threshold. Add more rows to the edge table to create different graphs based on the user_id column."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 rows affected.\n",
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>id</th>\n",
" <th>authority</th>\n",
" <th>hub</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8.43871829093e-07</td>\n",
" <td>0.338306115083</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0.158459587238</td>\n",
" <td>0.527865350448</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0.40562796969</td>\n",
" <td>0.675800764728</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0.721775835522</td>\n",
" <td>3.95111934817e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>0.158459587238</td>\n",
" <td>3.95111934817e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>0.316385413093</td>\n",
" <td>0.189719957843</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0.405199928761</td>\n",
" <td>0.337944978189</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1.60841750445e-05</td>\n",
" <td>0.632262085114</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0.316079985713</td>\n",
" <td>0.6325293909</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0.632364174872</td>\n",
" <td>0.31634729748</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.632694582988</td>\n",
" <td>8.04208767443e-06</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0.316079985713</td>\n",
" <td>8.04208767443e-06</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>0.0</td>\n",
" <td>1.22712519446e-10</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>2.45425034248e-10</td>\n",
" <td>0.31634729748</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 0, 8.4387182909347e-07, 0.338306115082665),\n",
" (1, 1, 0.158459587238244, 0.527865350448059),\n",
" (1, 2, 0.405627969689677, 0.675800764727558),\n",
" (1, 3, 0.721775835521825, 3.95111934817447e-07),\n",
" (1, 4, 0.158459587238244, 3.95111934817447e-07),\n",
" (1, 5, 0.316385413093048, 0.189719957843216),\n",
" (1, 6, 0.405199928761102, 0.337944978189241),\n",
" (2, 0, 1.60841750444904e-05, 0.632262085114062),\n",
" (2, 1, 0.316079985713431, 0.632529390899584),\n",
" (2, 2, 0.632364174872359, 0.316347297480213),\n",
" (2, 3, 0.632694582987791, 8.04208767442759e-06),\n",
" (2, 4, 0.316079985713431, 8.04208767442759e-06),\n",
" (2, 5, 0.0, 1.22712519446222e-10),\n",
" (2, 6, 2.45425034248205e-10, 0.316347297480213)]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"INSERT INTO edge VALUES\n",
"(0, 1, 2),\n",
"(0, 2, 2),\n",
"(0, 4, 2),\n",
"(1, 2, 2),\n",
"(1, 3, 2),\n",
"(2, 3, 2),\n",
"(3, 0, 2),\n",
"(4, 0, 2),\n",
"(5, 6, 2),\n",
"(6, 3, 2);\n",
"\n",
"DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
"\n",
"SELECT madlib.hits(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertex id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
" 'hits_out', -- Output table\n",
" NULL, -- Default max_iter\n",
" NULL, -- Threshold\n",
" 'user_id'); -- Grouping column\n",
"\n",
"SELECT * FROM hits_out ORDER BY user_id, id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 7. Other\n",
"Let's check against the output from p. 8 of http://www.cis.hut.fi/Opinnot/T-61.6020/2008/pagerank_hits.pdf"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"Done.\n",
"4 rows affected.\n",
"6 rows affected.\n",
"6 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>src</th>\n",
" <th>dest</th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 1, 1), (0, 2, 1), (0, 3, 1), (1, 2, 1), (1, 3, 1), (2, 1, 1)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS vertex, edge;\n",
"\n",
"CREATE TABLE vertex(\n",
" id INTEGER\n",
" );\n",
"\n",
"CREATE TABLE edge(\n",
" src INTEGER,\n",
" dest INTEGER,\n",
" user_id INTEGER\n",
" );\n",
"\n",
"INSERT INTO vertex VALUES\n",
"(0),\n",
"(1),\n",
"(2),\n",
"(3);\n",
"\n",
"INSERT INTO edge VALUES\n",
"(0, 1, 1),\n",
"(0, 2, 1),\n",
"(0, 3, 1),\n",
"(1, 2, 1),\n",
"(1, 3, 1),\n",
"(2, 1, 1);\n",
"\n",
"SELECT * from edge ORDER BY src, dest;"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>16</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 17), (2, 16)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM hits_out_summary order by user_id;"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"4 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>authority</th>\n",
" <th>hub</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.788680749581</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.459746429928</td>\n",
" <td>0.577334927798</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.627946343316</td>\n",
" <td>0.211345821783</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.627946343316</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0.0, 0.788680749580922),\n",
" (1, 0.459746429927966, 0.577334927797799),\n",
" (2, 0.627946343316246, 0.211345821783123),\n",
" (3, 0.627946343316246, 0.0)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS hits_out, hits_out_summary;\n",
"\n",
"SELECT madlib.hits(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertex id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimited string of edge arguments\n",
" 'hits_out', -- Output table of HITS\n",
" 100); -- Max iteration\n",
"\n",
"SELECT * FROM hits_out ORDER BY id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Yes ^^^ matches the results from the reference."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}