blob: 3258e44473bf72e97ba999b1ecb8f2a68d844bbc [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Weakly connected components\n",
"\n",
"Weakly connected components was added in MADlib 1.12. This notebook also includes helper functions based on the WCC output."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
" \"You should import from traitlets.config instead.\", ShimWarning)\n",
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
" warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
]
}
],
"source": [
"%load_ext sql"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"u'Connected: gpdbchina@madlib'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Greenplum 4.3.10.0\n",
"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
" \n",
"# PostgreSQL local\n",
"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
"\n",
"# Greenplum 4.2.3.0\n",
"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>version</th>\n",
" </tr>\n",
" <tr>\n",
" <td>MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-195-g85e89ef, cmake configuration time: Wed Jul 26 23:21:17 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(u'MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-195-g85e89ef, cmake configuration time: Wed Jul 26 23:21:17 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select madlib.version();\n",
"#%sql select version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Create vertex and edge tables"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"Done.\n",
"14 rows affected.\n",
"18 rows affected.\n",
"18 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>src</th>\n",
" <th>dest</th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>11</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>12</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>12</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>13</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>13</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>16</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 1, 1),\n",
" (0, 2, 1),\n",
" (1, 2, 1),\n",
" (1, 3, 1),\n",
" (2, 3, 1),\n",
" (2, 5, 1),\n",
" (2, 6, 1),\n",
" (3, 0, 1),\n",
" (5, 6, 1),\n",
" (6, 3, 1),\n",
" (10, 11, 2),\n",
" (10, 12, 2),\n",
" (11, 12, 2),\n",
" (11, 13, 2),\n",
" (12, 13, 2),\n",
" (13, 10, 2),\n",
" (15, 14, 2),\n",
" (15, 16, 2)]"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS vertex, edge;\n",
"\n",
"CREATE TABLE vertex(\n",
" id INTEGER\n",
");\n",
"\n",
"CREATE TABLE edge(\n",
" src INTEGER,\n",
" dest INTEGER,\n",
" user_id INTEGER\n",
");\n",
"\n",
"INSERT INTO vertex VALUES\n",
"(0),\n",
"(1),\n",
"(2),\n",
"(3),\n",
"(4),\n",
"(5),\n",
"(6),\n",
"(10),\n",
"(11),\n",
"(12),\n",
"(13),\n",
"(14),\n",
"(15),\n",
"(16);\n",
"\n",
"INSERT INTO edge VALUES\n",
"(0, 1, 1),\n",
"(0, 2, 1),\n",
"(1, 2, 1),\n",
"(1, 3, 1),\n",
"(2, 3, 1),\n",
"(2, 5, 1),\n",
"(2, 6, 1),\n",
"(3, 0, 1),\n",
"(5, 6, 1),\n",
"(6, 3, 1),\n",
"(10, 11, 2),\n",
"(10, 12, 2),\n",
"(11, 12, 2),\n",
"(11, 13, 2),\n",
"(12, 13, 2),\n",
"(13, 10, 2),\n",
"(15, 16, 2),\n",
"(15, 14, 2);\n",
"\n",
"SELECT * FROM edge ORDER BY src, dest;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Find all weakly connected components"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"14 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>component_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>14</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0),\n",
" (1, 0),\n",
" (2, 0),\n",
" (3, 0),\n",
" (5, 0),\n",
" (6, 0),\n",
" (4, 4),\n",
" (10, 10),\n",
" (11, 10),\n",
" (12, 10),\n",
" (13, 10),\n",
" (14, 14),\n",
" (15, 14),\n",
" (16, 14)]"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n",
"\n",
"SELECT madlib.weakly_connected_components(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertix id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
" 'wcc_out'); -- Output table of weakly connected components\n",
"\n",
"SELECT * FROM wcc_out ORDER BY component_id, id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Weakly connected components with grouping"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"13 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>id</th>\n",
" <th>component_id</th>\n",
" <th>user_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>14</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(0, 0, 1),\n",
" (1, 0, 1),\n",
" (2, 0, 1),\n",
" (3, 0, 1),\n",
" (5, 0, 1),\n",
" (6, 0, 1),\n",
" (10, 10, 2),\n",
" (11, 10, 2),\n",
" (12, 10, 2),\n",
" (13, 10, 2),\n",
" (14, 14, 2),\n",
" (15, 14, 2),\n",
" (16, 14, 2)]"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n",
"\n",
"SELECT madlib.weakly_connected_components(\n",
" 'vertex', -- Vertex table\n",
" 'id', -- Vertix id column\n",
" 'edge', -- Edge table\n",
" 'src=src, dest=dest', -- Comma delimted string of edge arguments\n",
" 'wcc_out', -- Output table of weakly connected components\n",
" 'user_id'); -- Grouping column name\n",
"\n",
"SELECT * FROM wcc_out ORDER BY component_id, id, user_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that vertex '4' is not identified as a separate component in the above result. This is because disconnected nodes cannot be assigned to a particular group with the current graph representation in MADlib."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Largest connected component"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>component_id</th>\n",
" <th>num_vertices</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 0, 6L), (2, 10, 4L)]"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"DROP TABLE IF EXISTS largest_cpt_table;\n",
"\n",
"SELECT madlib.graph_wcc_largest_cpt(\n",
" 'wcc_out', -- WCC output table\n",
" 'largest_cpt_table'); -- output table containing largest component ID\n",
"\n",
"SELECT * FROM largest_cpt_table ORDER BY component_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5. Histogram of number of vertices per connected component"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"3 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>component_id</th>\n",
" <th>num_vertices</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 0, 6L), (2, 10, 4L), (2, 14, 3L)]"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS histogram_table; \n",
"\n",
"SELECT madlib.graph_wcc_histogram(\n",
" 'wcc_out', -- WCC's output table\n",
" 'histogram_table'); -- output table containing the histogram of vertices\n",
"\n",
"SELECT * FROM histogram_table ORDER BY component_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 6. Check if two vertices belong to the same component"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"1 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>component_id</th>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>14</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(2, 14)]"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS vc_table;\n",
"\n",
"SELECT madlib.graph_wcc_vertex_check(\n",
" 'wcc_out', -- WCC output table\n",
" '14,15', -- Pair of vertex IDs\n",
" 'vc_table'); -- output table containing components that contain the two vertices\n",
"\n",
"SELECT * FROM vc_table ORDER BY component_id;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 7. Find all reachable vertices from a source vertex"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"5 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>component_id</th>\n",
" <th>dest</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 0, 5), (1, 0, 6)]"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS reach_table;\n",
"\n",
"SELECT madlib.graph_wcc_reachable_vertices(\n",
" 'wcc_out', -- WCC output table\n",
" '0', -- source vertex\n",
" 'reach_table'); -- output table containing all vertices reachable from source vertex\n",
"\n",
"SELECT * FROM reach_table ORDER BY component_id, dest;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 8. Count of connected components"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"1 rows affected.\n",
"2 rows affected.\n"
]
},
{
"data": {
"text/html": [
"<table>\n",
" <tr>\n",
" <th>user_id</th>\n",
" <th>num_components</th>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
"</table>"
],
"text/plain": [
"[(1, 1L), (2, 2L)]"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"DROP TABLE IF EXISTS count_table;\n",
"\n",
"SELECT madlib.graph_wcc_num_cpts(\n",
" 'wcc_out', -- WCC output table\n",
" 'count_table'); -- output table containing number of components per group\n",
"\n",
"SELECT * FROM count_table;"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}