blob: becfc4cce6f6b69dc657b43ca385898aa24e4f2c [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# PageRank\n",
"The PageRank algorithm produces a probability distribution representing the likelihood that a person randomly traversing a graph will arrive at any particular vertex. PageRank was added in MADlib 1.11.\n",
"\n",
"We also implement personalized PageRank, in which a notion of importance provides personalization to a query. This was added in 1.14"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Greenplum Database 5.x on GCP (PM demo machine) - via tunnel\n",
"%sql postgresql://gpadmin@localhost:8000/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-100-g4987e8f, cmake configuration time: Wed Mar 24 23:51:47 UTC 2021, build type: release, build system: Linux-3.10.0-1160.21.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.18.0-dev, git revision: rel/v1.17.0-100-g4987e8f, cmake configuration time: Wed Mar 24 23:51:47 UTC 2021, build type: release, build system: Linux-3.10.0-1160.21.1.el7.x86_64, C compiler: gcc 4.8.5, C++ compiler: g++ 4.8.5',)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Create vertex and edge tables"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"Done.\n",
"7 rows affected.\n",
"22 rows affected.\n",
"22 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>src</th>\n",
" <th>dest</th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 1, 1),\n",
" (0, 2, 2),\n",
" (0, 2, 1),\n",
" (0, 4, 2),\n",
" (0, 4, 1),\n",
" (0, 1, 2),\n",
" (1, 3, 1),\n",
" (1, 3, 2),\n",
" (1, 2, 2),\n",
" (1, 2, 1),\n",
" (2, 5, 1),\n",
" (2, 3, 1),\n",
" (2, 3, 2),\n",
" (2, 6, 1),\n",
" (3, 0, 2),\n",
" (3, 0, 1),\n",
" (4, 0, 1),\n",
" (4, 0, 2),\n",
" (5, 6, 2),\n",
" (5, 6, 1),\n",
" (6, 3, 1),\n",
" (6, 3, 2)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS vertex, edge;\n",
"\n",
"CREATE TABLE vertex(\n",
" id INTEGER\n",
" );\n",
"\n",
"CREATE TABLE edge(\n",
" src INTEGER,\n",
" dest INTEGER,\n",
" user_id INTEGER\n",
" );\n",
"\n",
"INSERT INTO vertex VALUES\n",
"(0),\n",
"(1),\n",
"(2),\n",
"(3),\n",
"(4),\n",
"(5),\n",
"(6);\n",
"\n",
"INSERT INTO edge VALUES\n",
"(0, 1, 1),\n",
"(0, 2, 1),\n",
"(0, 4, 1),\n",
"(1, 2, 1),\n",
"(1, 3, 1),\n",
"(2, 3, 1),\n",
"(2, 5, 1),\n",
"(2, 6, 1),\n",
"(3, 0, 1),\n",
"(4, 0, 1),\n",
"(5, 6, 1),\n",
"(6, 3, 1),\n",
"(0, 1, 2),\n",
"(0, 2, 2),\n",
"(0, 4, 2),\n",
"(1, 2, 2),\n",
"(1, 3, 2),\n",
"(2, 3, 2),\n",
"(3, 0, 2),\n",
"(4, 0, 2),\n",
"(5, 6, 2),\n",
"(6, 3, 2);\n",
"\n",
"SELECT * from edge ORDER BY src;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Calculate PageRank"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"7 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>pagerank</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.287518161212</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.210171199451</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.146637377532</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.102910437211</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.102910437211</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.0972746644343</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.0525777229482</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0.287518161212111),\n",
" (3, 0.210171199451415),\n",
" (2, 0.146637377532288),\n",
" (4, 0.102910437211324),\n",
" (1, 0.102910437211324),\n",
" (6, 0.0972746644343418),\n",
" (5, 0.0525777229481976)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n",
"\n",
"SELECT madlib.pagerank(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertix id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
" 'pagerank_out'); -- Output table of PageRank\n",
"\n",
"SELECT * FROM pagerank_out ORDER BY pagerank DESC;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look at the summary table:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(12,)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM pagerank_out_summary;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now run PageRank with a damping factor of 0.5 which results in different final values:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Calculate PageRank with optional params"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"7 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>pagerank</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.225477430556</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.199105076058</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.136259748402</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.132687846189</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.109006420855</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.109006420855</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.088457057085</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0.225477430555642),\n",
" (3, 0.199105076058278),\n",
" (2, 0.136259748401728),\n",
" (6, 0.132687846188875),\n",
" (4, 0.109006420855422),\n",
" (1, 0.109006420855422),\n",
" (5, 0.0884570570850217)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n",
"SELECT madlib.pagerank(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertix id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
" 'pagerank_out', -- Output table of PageRank\n",
" 0.5); -- Damping factor\n",
"SELECT * FROM pagerank_out ORDER BY pagerank DESC;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Grouping\n",
"Now compute the PageRank distribution separately for each user using the grouping feature:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>id</th>\n",
" <th>pagerank</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.278254883886</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0.201881146671</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0.142881123461</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0.114536378321</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>0.100267456154</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0.100267456154</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>0.0619115553529</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0.318546250042</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.237866867733</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0.159148764894</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0.11168334438</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0.11168334438</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>0.0396428571429</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>0.0214285714286</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 0L, 0.278254883885528),\n",
" (1, 3L, 0.201881146670752),\n",
" (1, 2L, 0.142881123460599),\n",
" (1, 6L, 0.114536378321472),\n",
" (1, 4L, 0.10026745615438),\n",
" (1, 1L, 0.10026745615438),\n",
" (1, 5L, 0.0619115553528898),\n",
" (2, 0L, 0.318546250041731),\n",
" (2, 3L, 0.237866867733431),\n",
" (2, 2L, 0.159148764893974),\n",
" (2, 1L, 0.111683344379718),\n",
" (2, 4L, 0.111683344379718),\n",
" (2, 6L, 0.0396428571428571),\n",
" (2, 5L, 0.0214285714285714)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n",
"\n",
"SELECT madlib.pagerank(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertix id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
" 'pagerank_out', -- Output table of PageRank\n",
" NULL, -- Default damping factor (0.85)\n",
" NULL, -- Default max iters (100)\n",
" 0.00000001, -- Threshold\n",
" 'user_id'); -- Grouping column name\n",
"\n",
"SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>27</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>31</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 27), (2, 31)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM pagerank_out_summary ORDER BY user_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5. Personalized PageRank\n",
"Here we specify {2,4} as the personalization vertices. This parameter could be specified as ARRAY[2,4] as well:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"7 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>pagerank</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0.282616480981</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0.189069710497</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0.177501646133</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>0.15505560795</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0.0800556079496</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0.0743076577868</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0.0401701653568</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0.282616480980651),\n",
" (2, 0.18906971049691),\n",
" (3, 0.177501646133459),\n",
" (4, 0.155055607949638),\n",
" (1, 0.0800556079496381),\n",
" (6, 0.0743076577867786),\n",
" (5, 0.0401701653567887)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n",
"SELECT madlib.pagerank(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertix id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
" 'pagerank_out', -- Output table of PageRank\n",
" NULL, -- Default damping factor (0.85)\n",
" NULL, -- Default max iters (100)\n",
" NULL, -- Default Threshold\n",
" NULL, -- No Grouping\n",
" '{2,4}'); -- Personalization vertices\n",
"SELECT * FROM pagerank_out ORDER BY pagerank DESC;"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>__iterations__</th>\n",
" </tr>\n",
" <tr>\n",
" <td>37</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(37,)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"SELECT * FROM pagerank_out_summary;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 1
}