| % When using TeXShop on the Mac, let it know the root document. The following |
| % must be one of the first 20 lines. |
| % !TEX root = ../design.tex |
| |
| % Licensed to the Apache Software Foundation (ASF) under one |
| % or more contributor license agreements. See the NOTICE file |
| % distributed with this work for additional information |
| % regarding copyright ownership. The ASF licenses this file |
| % to you under the Apache License, Version 2.0 (the |
| % "License"); you may not use this file except in compliance |
| % with the License. You may obtain a copy of the License at |
| |
| % http://www.apache.org/licenses/LICENSE-2.0 |
| |
| % Unless required by applicable law or agreed to in writing, |
| % software distributed under the License is distributed on an |
| % "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| % KIND, either express or implied. See the License for the |
| % specific language governing permissions and limitations |
| % under the License. |
| |
| \chapter[Graph]{Graph} |
| |
| \begin{moduleinfo} |
| \item[Authors] \href{mailto:okislal@pivotal.io}{Orhan Kislal}, \href{mailto:njayaram@pivotal.io}{Nandish Jayaram}, |
| \href{mailto:rraghu@pivotal.io}{Rashmi Raghu}, \href{mailto:jmei@pivotal.io@pivotal.io}{Jingyi Mei}, |
| \href{mailto:nkak@pivotal.io}{Nikhil Kak} |
| \item[History] |
| \begin{modulehistory} |
| \item[v0.1] Initial version, SSSP only. |
| \item[v0.2] Graph Framework, SSSP implementation details. |
| \item[v0.3] PageRank |
| \item[v0.4] APSP |
| \item[v0.5] Weakly Connected Components |
| \item[v0.6] Breadth First Search (BFS) |
| \item[v0.7] Hyperlink-Induced Topic Search (HITS) |
| \end{modulehistory} |
| \end{moduleinfo} |
| |
| |
| % Abstract. What is the problem we want to solve? |
| |
| This module implements various graph algorithms that are used in a number of |
| applications such as social networks, telecommunications and road networks. |
| |
| \section{Graph Framework} \label{sec:graph:fw} |
| |
| MADlib graph representation depends on two structures, a \emph{vertex} table |
| and an \emph{edge} table. The vertex table has to have a column of vertex ids. |
| The edge table has to have 2 columns: source vertex id, destination vertex id. |
| For most algorithms an edge weight column is required as well. The |
| representation assumes a directed graph, an edge from $x$ to $y$ does |
| \emph{not} guarantee the existence of an edge from $y$ to $x$. Both of the |
| tables may have additional columns as required. Multi-edges (multiple edges |
| from a vertex to the same destination) and loops (edge from a vertex to |
| itself) are allowed. This representation does not impose any ordering of |
| vertices or edges. An example graph is given in Figure~\ref{sssp:example} and |
| its representative tables are given in Table~\ref{sssp:rep}. |
| |
| \begin{figure}[h] |
| \centering |
| \includegraphics[width=0.9\textwidth]{figures/graph_example.pdf} |
| \caption{A sample graph} |
| \label{sssp:example} |
| \end{figure} |
| |
| \begin{table} |
| \begin{tabular}{| c | } |
| \hline |
| vid \\ \hline |
| 0 \\ \hline |
| 1 \\ \hline |
| 2 \\ \hline |
| 3 \\ \hline |
| 4 \\ \hline |
| 5 \\ \hline |
| 6 \\ \hline |
| 7 \\ |
| \hline |
| \end{tabular} |
| \quad |
| \begin{tabular}{| c | c | c |} |
| \hline |
| src & dest & weight \\ \hline |
| 0 & 1 & 1 \\ \hline |
| 0 & 2 & 1 \\ \hline |
| 0 & 4 & 10 \\ \hline |
| 1 & 2 & 2 \\ \hline |
| 1 & 3 & 10 \\ \hline |
| 1 & 5 & 1 \\ \hline |
| 2 & 3 & 1 \\ \hline |
| 2 & 5 & 1 \\ \hline |
| 2 & 6 & 3 \\ \hline |
| 3 & 0 & 1 \\ \hline |
| 5 & 6 & 1 \\ \hline |
| 6 & 7 & 1 \\ |
| \hline |
| \end{tabular} |
| \caption{Graph representation of vertices (left) and edges(right) in the |
| database} |
| \label{sssp:rep} |
| \end{table} |
| |
| |
| \section{Single Source Shortest Path} \label{sec:graph:sssp} |
| |
| Given a graph and a source vertex, single source shortest path (SSSP) |
| algorithm finds a path for every vertex such that the sum of the weights of |
| its constituent edges is minimized. |
| |
| Shortest path is defined as follows. Let $e_{i,j}$ be the edge from vertex $i$ |
| to vertex $j$ and $w_{i,j}$ be its weight. Given a graph G, the shortest path |
| from $s$ to $d$ is $P = (v_1, v_2 \dots, v_n)$ (where $v_1=s$ and $v_n=d$) |
| that over all possible $n$ minimizes the sum $ \sum _{i=1}^{n-1}f(e_{i,i+1})$. |
| |
| % \subsection{Bellman Ford Algorithm} |
| |
| Bellman-Ford Algorithm \cite{bellman1958routing,ford1956network} is based on |
| the following idea: We start with a naive approximation for the cost of |
| reaching every vertex. At each iteration, these values are refined based on |
| the edge list and the existing approximations. If there are no refinements at |
| any given step, the algorithm returns the calculated results. If the algorithm |
| does not converge in $|V|-1$ iterations, this indicates the existence of a |
| negative cycle in the graph. |
| |
| |
| \begin{algorithm}[SSSP$(V,E,start)$] \label{alg:sssp} |
| \alginput{Vertex set $V$, edge set $E$, starting vertex $start$} |
| \algoutput{Distance and parent set for every vertex $cur$} |
| \begin{algorithmic}[1] |
| \State $toupdate(0) \set (start,0,start)$ |
| \For{every $i \in 0\dots|V|-1$} |
| \For{every tuple $t \in toupdate(i)$} \label{alg:sssp:update} |
| \For{every edge $e \mid e.src = t.id$} |
| \State $local \set e.val + t.val$ |
| \If{$local < toupdate(i+1,e.dest).val$} \label{alg:sssp:single} |
| \State $toupdate(i+1,dest) \set (local,e.src)$ |
| \EndIf |
| \EndFor |
| \EndFor |
| \For{every tuple $t \in toupdate(i+1)$} |
| \If{$t.val < cur(t.id).val$} |
| \State $cur(t.id) \set (t.val,t.parent)$ |
| \EndIf |
| \EndFor |
| \EndFor |
| \end{algorithmic} |
| \end{algorithm} |
| |
| \begin{description} |
| \item edge: $(src,dest,val)$. The edges of the graph. |
| \item cur: $id \rightarrow (val,parent)$. The intermediate SSSP results. |
| \item toupdate: $iter \rightarrow (id \rightarrow (val,parent))$. The set of updates. |
| \end{description} |
| |
| Changes from the standard Bellman-Ford algorithm: |
| |
| \begin{description} |
| \item Line~\ref{alg:sssp:update}: We only check the vertices that have been |
| updated in the last iteration. |
| \item Line~\ref{alg:sssp:single}: At each iteration, we update a given vertex |
| only one time. This means the toupdate set cannot contain multiple records |
| for the same vertex which requires the comparison with the existing value. |
| \end{description} |
| |
| This is not a 1-to-1 pseudocode for the implementation since we don't compare |
| the `toupdate` table records one by one but calculate the overall minimum. In |
| addition, the comparison with `cur` values take place earlier to reduce the |
| number of tuples in the `toupdate` table. |
| |
| \subsection{Implementation Details} |
| |
| In this section, we discuss the MADlib implementation of the SSSP algorithm |
| in depth. |
| |
| \begin{algorithm}[SSSP$(V,E,start)$] \label{alg:sssp:high} |
| \begin{algorithmic}[1] |
| \Repeat |
| \State Find Updates |
| \State Apply updates to the output table |
| \Until {There are no updates} |
| \end{algorithmic} |
| \end{algorithm} |
| |
| The implementation consists of two SQL blocks that are called sequentially |
| inside a loop. We will follow the example graph at Figure~\ref{sssp:example} |
| with the starting point as $v_0$. The very first update on the output table is |
| the source vertex. Its weight is $0$ and its parent is itself ($v_0$). After |
| this initialization step, the loop starts with Find Updates (the individual |
| updates will be represented with <dest,value,parent> format). Looking at the |
| example, it is clear that the updates should be <1,1,0>, <2,1,0> and <4,10,0>. |
| We will assume this iteration is already completed and look how the next |
| iteration of the algorithm works to explain the implementation details. |
| |
| \begin{algorithm}[Find Updates$(E,old\_update,out\_table)$] |
| \label{alg:sssp:findu} |
| \begin{lstlisting} |
| INSERT INTO new_update |
| SELECT DISTINCT ON (y.id) y.id AS id, |
| y.val AS val, |
| y.parent AS parent |
| FROM out_table INNER JOIN ( |
| SELECT edge_table.dest AS id, x.val AS val, old_update.id AS parent |
| FROM old_update |
| INNER JOIN edge_table |
| ON (edge_table.src = old_update.id) |
| INNER JOIN ( |
| SELECT edge_table.dest AS id, |
| min(old_update.val + edge_table.weight) AS val |
| FROM old_update INNER JOIN |
| edge_table AS edge_table ON |
| (edge_table.src=old_update.id) |
| GROUP BY edge_table.dest |
| ) x |
| ON (edge_table.dest = x.id) |
| WHERE ABS(old_update.val + edge_table.weight - x.val) < EPSILON |
| ) AS y ON (y.id = out_table.vertex_id) |
| WHERE y.val<out_table.weight |
| \end{lstlisting} |
| \end{algorithm} |
| |
| The Find Updates query is constructed in 4 levels of subqueries: \emph{find |
| values, find parents, eliminate duplicates and ensure improvement}. |
| |
| \begin{itemize} |
| |
| \item We begin our analysis at the innermost subquery, emph{find values} |
| (lines 11-16). This subquery takes a set of vertices (in the table |
| $old\_update$) and finds the reachable vertices. In case a vertex is reachable |
| by multiple vertices, only the path that has the minimum cost is considered |
| (hence the name find values). There are two important points to note: |
| \begin{itemize} |
| \item The input vertices need the value of their path as well. |
| \begin{itemize} |
| \item In our example, both $v_1$ and $v_2$ can reach $v_3$. We would |
| have to use $v_2 \rightarrow v_3$ edge since that gives the lowest possible |
| path value. |
| \end{itemize} |
| \item The subquery is aggregating the rows using the $min$ operator for |
| each destination vertex and unable to return the source vertex at the |
| same time to use as the parent value. |
| \begin{itemize} |
| \item We know the value of $v_3$ should be $2$ but we cannot know |
| its parent ($v_2$) at the same time. |
| \end{itemize} |
| \end{itemize} |
| |
| \item The \emph{find parents} subquery is designed to solve the |
| aforementioned limitation. We combine the result of \emph{find values} with |
| $edge$ and $old\_update$ tables (lines 7-10) and get the rows that has the |
| same minimum value. |
| \begin{itemize} |
| \item Note that, we would have to tackle the problem of tie-breaking. |
| \begin{itemize} |
| \item Vertex $v_5$ has two paths leading into: <5,2,1> and <5,2,2>. |
| The inner subquery will return <5,2> and it will match both of these |
| edges. |
| \end{itemize} |
| \item It is redundant to keep both of them in the update list as that |
| would require updating the same vertex multiple times in a given |
| iteration. |
| \end{itemize} |
| |
| \item At this level, we employ the \emph{eliminate duplicates} subquery. By |
| using the $DISTINCT$ clause at line 2, we allow the underlying system to |
| accept only a single one of them. |
| |
| \item Finally, we introduce the \emph{ensure improvement} subquery to make |
| sure these updates are actually leading us to shortest paths. Line 21 ensures |
| that the values stored in the $out\_table$ does not increase and the solution |
| does not regress throughout the iterations. |
| \end{itemize} |
| |
| Applying updates is straightforward as the values and the associated parent |
| values are replaced using the $new\_update$ table. After this operation is |
| completed the $new\_update$ table becomes $old\_update$ for the next iteration |
| of the algorithm. |
| |
| Please note that, for ideal performance, \emph{vertex} and \emph{edge} tables |
| should be distributed on \emph{vertex id} and \emph{source id} respectively. |
| |
| \section{All Pairs Shortest Paths} \label{sec:graph:apsp} |
| |
| Given a graph and a source vertex, all pairs shortest paths (APSP) algorithm |
| finds a path for every vertex pair such that the sum of the weights of its |
| constituent edges is minimized. Please refer to the |
| Section~\ref{sec:graph:sssp} on single source shortest path for the |
| mathematical definition of shortest path. |
| |
| Our implementation has a dynamic programming approach, based on the matrix |
| multiplication inspired APSP algorithm \cite{apsp}. The idea is similar to |
| the one from SSSP implementation. We start with a naive approximation for the |
| cost of every vertex pair (infinite). At each iteration, these values are |
| refined based on the edge list and the existing approximations. This |
| refinement step is similar to a matrix multiplication. For every vertex pair |
| $i,j$, we check every edge $e: j \rightarrow k$ to see if it is possible to |
| use $e$ to reduce the cost of path $i \rightarrow k$. If there are no |
| refinements at any given step, the algorithm returns the calculated results. |
| If the algorithm does not converge in $|V|-1$ iterations, this indicates the |
| existence of a negative cycle in the graph. |
| |
| \begin{algorithm}[APSP$(V,E)$] \label{alg:apsp} |
| \alginput{Vertex set $v$, edge set $E$} |
| \algoutput{Distance and parent set for every vertex pair} |
| \begin{algorithmic}[1] |
| |
| \While {$update$ is $True$} |
| \State $update \set False$ |
| \For{every vertex pair $i,j$} |
| \For{every edge $j \rightarrow k$} |
| \If{$ val(i \rightarrow j) + val(j \rightarrow k) < val(i \rightarrow k)$} |
| \State $val(i \rightarrow k) \set val(i \rightarrow j) + val(j \rightarrow k)$ |
| \State $parent(i \rightarrow k) \set j$ |
| \State $update \set True$ |
| \EndIf |
| \EndFor |
| \EndFor |
| \EndWhile |
| \end{algorithmic} |
| \end{algorithm} |
| |
| \subsection{Implementation Details} |
| |
| The implementation details are similar to the SSSP as the requirements and |
| restrictions such as finding the parent, distinct updates, etc. are common in |
| both cases. This section will mostly focus on the differences in the APSP |
| implementation. |
| |
| \begin{algorithm}[Find Updates$(E,out)$] |
| \label{alg:apsp:findu} |
| \begin{lstlisting} |
| INSERT INTO update |
| SELECT DISTINCT ON (y.src, y.dest) y.src AS src, y.dest AS dest |
| y.val AS val, |
| y.parent AS parent |
| FROM out INNER JOIN ( |
| SELECT |
| x.src AS src, x.dest AS dest, |
| x.val AS val, out.dest AS parent |
| FROM out |
| INNER JOIN edge_table |
| ON (edge_table.src = out.dest) |
| INNER JOIN ( |
| SELECT out.src AS src, edge_table.dest AS dest, |
| min(out.val + edge_table.weight) AS val |
| FROM out INNER JOIN |
| edge_table ON |
| (edge_table.src=out.dest) |
| GROUP BY out.src, edge_table.dest |
| ) x |
| ON (edge_table.src = x.src AND edge_table.dest = x.dest) |
| WHERE ABS(out.val + edge_table.weight - x.val) < EPSILON |
| ) AS y ON (y.src = out.src AND y.dest = out.dest) |
| WHERE y.val < out.val |
| \end{lstlisting} |
| \end{algorithm} |
| |
| The only major difference comes in the innermost subquery (lines 13-18). The |
| \emph{group by} clause ensures that we try to reduce the weight for every |
| $out.src$ ($i$) and $edge\_table.dest$ ($k$) pair. The \emph{inner join on} |
| clause ensures that there is a connecting edge ($j\rightarrow k$) that can be |
| used for the $i,j$ pair. The rest of the changes are mostly trivial as the |
| algorithm needs to check for both source and destination during joins (instead |
| of just the destination). |
| |
| |
| \section{PageRank} \label{sec:graph:pagerank} |
| \begin{figure}[h] |
| \centering |
| \includegraphics[width=0.5\textwidth]{figures/pagerank_example.pdf} |
| \caption{An example graph for PageRank} |
| \label{pagerank:example} |
| \end{figure} |
| |
| PageRank is a link analysis algorithm that assigns a score to every vertex |
| measuring the relative importance of vertices within the set of all |
| vertices. PageRank~\cite{pagerank} was first used by Google to measure the |
| importance of website pages where the World Wide Web was modeled as a directed |
| graph. Figure~\ref{pagerank:example} shows an example graph with the PageRank |
| value of each vertex. The intuition behind the algorithm is that the number and |
| quality of links to a vertex determine the authoritativeness of the vertex, |
| which is reflected in the PageRank scores as shown in the figure. |
| |
| The pagerank module in MADlib implements the model of a random surfer who |
| follows the edges of a graph to traverse it, and jumps to a random vertex |
| after several clicks. The random surfer is modeled using a damping factor |
| that represents the probability with which the surfer will continue to follow |
| links in the graph rather than jumping to a random vertex. MADlib's pagerank |
| module outputs a probability distribution that represents the likelihood that |
| the random surfer arrives at a particular vertex in the graph. |
| |
| PageRank is an iterative algorithm where the PageRank scores of vertices from |
| the previous iteration are used to compute the new PageRank scores. The |
| PageRank score of a vertex $v$, at the $i^{th}$ iteration, denoted by $PR(v_i)$ |
| is computed as: |
| |
| \begin{equation} |
| PR(v_i) = \frac{1-d}{N} + d \sum_{u \in M(v)}(\frac{PR(u_{i-1})}{L(u)}) |
| \label{eq:pagerank} |
| \end{equation} |
| |
| where $N$ is the number of vertices in the graph, $d$ is the damping factor, |
| $M(v)$ represents the set of vertices that have an edge to vertex $v$, |
| $L(u)$ represents the out-degree of vertex $u$, i.e., the number of |
| out-going edges from vertex $u$, and $PR(u_{i-1})$ represents the PageRank |
| score of vertex $u$ in the $(i-1)^{st}$ iteration. |
| |
| $\frac{1-d}{N}$ represents the tiny probability with which the surfer |
| would randomly jump to vertex $v$, rather than arriving at $v$ following |
| links in the graph. This ensures that there is some probability of visiting |
| every vertex in the graph even if they do not have any incoming edges. Note |
| that the PageRank score computed for a vertex $v$ using~\ref{eq:pagerank} |
| in the $i^{th}$ iteration is not updated until the new score is computed for |
| all the vertices in the graph. The computation terminates either when the |
| PageRank score of no vertex changes beyond a threshold across two consecutive |
| iterations, or when a pre-set number of iterations are completed. |
| |
| \paragraph{Personalized Pagerank:} |
| The Personalized Pagerank variant of Pagerank module in MADlib takes an extra argument as a set of user provided vertices. |
| These personalization vertices will have a higher jump probability as compared to other vertices and random surfer is more likely to jump on these personalization vertices. These personalization vertices are initialized with an initial probabilty of $\frac{1}{N}$ where $N$ is the total number of personlaized vertices in the graph and rest of the vertices in the graph are assigned an initial probability of 0. Pagerank calculated for these vertices is biased as a random jump probability is assigned to only these vertices during the pagerank calculation,which is equal to (1 - damping factor). |
| |
| \subsection{Implementation Details} \label{sec:pagerank:implementation} |
| |
| In this section, we discuss the MADlib implementation of PageRank in depth. |
| We maintain two tables at every iteration: $previous$ and $cur$. The |
| $previous$ table maintains the PageRank scores of all vertices computed in |
| the previous iteration, while $cur$ maintains the updated scores of all |
| vertices in the current iteration. |
| |
| \begin{algorithm}[PageRank$(V,E)$] \label{alg:pagerank:high} |
| \begin{algorithmic}[1] |
| \State Create $previous$ table with a default PageRank score of |
| $\frac{1}{N}$ for every vertex |
| \Repeat |
| \State Create empty table $cur$. |
| \State Update $cur$ using PageRank scores of vertices in $previous$ |
| \State Update PageRank scores of vertices without incoming edges |
| \State Drop $previous$ and rename $cur$ to $previous$ |
| \Until {PageRank scores have converged or \emph{max} iterations have elapsed} |
| \end{algorithmic} |
| \end{algorithm} |
| |
| The implementation consists of updating the PageRank scores of all vertices |
| at every iteration, using the PageRank scores of vertices from the previous |
| iteration. The PageRank score of every vertex is initialized to $\frac{1}{N}$ |
| where $N$ is the total number of vertices in the graph. The out-degree of |
| every vertex in the graph (represented by $L(u)$ in eq.~\ref{eq:pagerank}), |
| is captured in table $out\_cnts$. The following query is used to create and |
| update the PageRank scores in $cur$ table using the PageRank scores in |
| $previous$ table. |
| |
| \begin{algorithm}[Update PageRank scores$(previous,out\_cnts,d,N)$] |
| \label{alg:pagerank:update} |
| \begin{lstlisting} |
| CREATE TABLE cur AS |
| SELECT edge_table.dest AS id, |
| SUM(previous1.pagerank/out_cnts.cnt)*d + (1-d)/N AS pagerank |
| FROM edge_table |
| INNER JOIN previous ON edge_table.dest = previous.id |
| INNER JOIN out_cnts ON edge_table.src = out_cnts.id |
| INNER JOIN previous AS previous1 ON edge_table.src = previous1.id |
| GROUP BY edge_table.dest |
| |
| -- Update PageRank scores of vertices without any incoming edges: |
| INSERT INTO cur |
| SELECT id, (1-d)/N AS pagerank |
| FROM previous |
| WHERE id NOT IN ( |
| SELECT id |
| FROM cur |
| ) |
| \end{lstlisting} |
| \end{algorithm} |
| |
| The PageRank computation is terminated either when a fixed number of iterations |
| are completed, or when the PageRank scores of all vertices have converged. The |
| PageRank score of a vertex is deemed converged if the absolute difference in |
| its PageRank scores from $previous$ and $cur$ is less than a specified threshold. |
| The following query is used to find all the vertices whose PageRank scores have |
| not converged yet. |
| |
| \begin{algorithm}[Update PageRank scores$(previous,cur,threshold)$] |
| \label{alg:pagerank:update} |
| \begin{lstlisting} |
| SELECT id |
| FROM cur |
| INNER JOIN previous ON cur.id = previous.id |
| WHERE ABS(previous.pagerank - cur.pagerank) > threshold |
| \end{lstlisting} |
| \end{algorithm} |
| |
| \subsection{Best Practices} \label{sec:pagerank:bestpractices} |
| |
| The pagerank module in MADlib has a few optional parameters: damping factor |
| $d$, number of iterations $max$, and the threshold for convergence $threshold$. |
| The default values for these parameters when not specified by the user are |
| $0.85$, $100$ and $\frac{1}{N*1000}$ respectively. |
| |
| The damping factor denotes the probability with which the surfer uses the edges |
| to traverse the graph. If set to $0$, it implies that the only way a surfer |
| would visit a vertex in the graph is by randomly jumping to it. If set to |
| $1$, it implies that the only way the surfer can reach a vertex is by following |
| the edges in the graph, thus precluding the surfer from reaching a vertex |
| that has no incoming edges. It is common practice to set damping factor |
| to $0.85$~\cite{pagerank}, and the maximum number of iterations to $100$. |
| The convergence test for PageRank in MADlib checks for the delta between |
| the PageRank scores of a vertex across two consecutive iterations. Since |
| the initial value of the PageRank score is set to $\frac{1}{N}$, the delta |
| will be small in the initial iterations when $N$ is large (say over 100 |
| million). We thus set the threshold to $\frac{1}{N*1000}$, and it is to be |
| noted that this is not based on any experimental study. Users of MADlib are |
| encouraged to consider this factor when setting a value for threshold, since |
| a high $threshold$ value would lead to early termination of PageRank |
| computation, thus resulting in incorrect PageRank values. |
| |
| |
| \section{Weakly Connected Components} \label{sec:graph:wcc} |
| \begin{figure}[h] |
| \centering |
| \includegraphics[width=0.5\textwidth]{figures/wcc_example.pdf} |
| \caption{An example disconnected directed graph} |
| \label{wcc:example} |
| \end{figure} |
| |
| Given a directed graph $G$, a weakly connected component is a subgraph |
| $G_{sub}$ of $G$, such that there exists a path from every vertex in $G_{sub}$ |
| to every other vertex in $G_{sub}$, ignoring the direction of the edges. |
| |
| The weakly connected component module implemented in MADlib is based on |
| GRAIL~\cite{grail}. All vertices are initialized with their own vertex |
| ID as the component ID, and are considered to be active. In every iteration, |
| each active vertex's component ID is updated with the smallest component ID |
| value of all its neighbors. Any vertex whose component ID is not updated in |
| the current iteration is deemed as an inactive vertex for the next iteration. |
| Execution continues until there are no active vertices left. Since each vertex |
| is initialized with its own ID as the component ID, and updated based on |
| neighboring nodes' component IDs, the final component ID of a component will |
| be equal to the smallest vertex ID in the corresponding subgraph. |
| Figure~\ref{wcc:example} shows an example directed graph with two disconnected |
| subgraphs. The subgraph containing vertices $1$, $2$, $3$, $4$, $5$ and $6$ |
| forms a weakly connected component, and is assigned component ID 1, while the |
| subgraph containing vertices $12$, $14$, $21$ and $23$ forms the second component |
| and is assigned component ID 12. |
| |
| \subsection{Implementation Details} \label{sec:wcc:implementation} |
| |
| In this section, we discuss the MADlib implementation of weakly connected |
| components in depth. We maintain the following tables at every iteration: |
| $oldupdate$, $message$ and $newupdate$. In $newupdate$, the component ID |
| of each vertex is initialized to infinity, while the component ID of vertices |
| in the $message$ table is initialized to their corresponding vertex ID. |
| |
| \begin{algorithm}[Weakly Connected Components$(V,E)$] \label{alg:wcc:high} |
| \begin{algorithmic}[1] |
| \State Create $newupdate$ table with a default component ID of |
| $infinity$ for every vertex |
| \State Create $message$ table with a default component ID of the |
| corresponding $id$ (vertex ID) for every vertex |
| \Repeat |
| \State Update the $oldupdate$ table |
| \State Update $toupdate$ table with active vertices |
| \State Update the $newupdate$ table |
| \State Update $message$ table with potential new component IDs for each vertex |
| \Until {There are no active vertices in $toupdate$ table} |
| \end{algorithmic} |
| \end{algorithm} |
| |
| The $message$ table contains the component IDs associated with all its |
| immediate neighbors. At each iteration, $oldupdate$ is updated with the |
| minimum of all the associated component IDs found for a vertex in $message$. |
| |
| \begin{algorithm}[Update oldupdate table] |
| \begin{lstlisting} |
| SELECT id, MIN(message.component_id) as component_id |
| FROM message |
| GROUP BY id |
| \end{lstlisting} |
| \end{algorithm} |
| |
| Table $toupdate$ records all vertices whose component IDs must be updated, |
| and are thus marked active. |
| |
| \begin{algorithm}[Update toupdate table with active vertices] |
| \begin{lstlisting} |
| -- Find vertices whose component ID must be updated |
| CREATE TABLE toupdate AS |
| SELECT id, component_id |
| FROM oldupdate, newupdate |
| WHERE oldupdate.id = newupdate.id AND |
| oldupdate.component_id < newupdate.component_id |
| |
| -- Update the component IDs |
| UPDATE newupdate SET |
| component_id = toupdate.component_id |
| FROM toupdate |
| WHERE newupdate.id = toupdate.id |
| \end{lstlisting} |
| \end{algorithm} |
| |
| Finally, the $message$ table is updated with potential new |
| component IDs for active vertices using the following query: |
| |
| \begin{algorithm}[Update message table$(toupdate, edge)$] |
| \label{alg:wcc:message} |
| \begin{lstlisting} |
| CREATE TEMP TABLE message AS |
| SELECT id, MIN(component_id) AS component_id |
| FROM ( |
| SELECT edge.src AS id, |
| toupdate.component_id |
| FROM toupdate, edge |
| WHERE edge.dest = toupdate.id |
| UNION ALL |
| SELECT edge.dest AS id, |
| toupdate.component_id |
| FROM toupdate, edge |
| WHERE edge.src = toupdate.id |
| ) AS t |
| GROUP BY id |
| \end{lstlisting} |
| \end{algorithm} |
| |
| At the end of the computation, $newupdate$ will have the component ID |
| associated with each vertex in $G$. The component ID of all the vertices |
| in a component is equal to the smallest vertex ID in the corresponding |
| subgraph. |
| |
| \subsection{Edge Table Duplication} \label{sec:wcc:duplication} |
| |
| The queries explained in the Section~\ref{sec:wcc:implementation} expose a |
| potential performance drawback in Greenplum systems. In general, we advise |
| that the edge tables should be distributed by their source columns. However, |
| in WCC, we use both source and destination columns of the edge table in JOIN |
| clauses. In addition, we employ a GROUP BY clause using the column that did |
| not serve as the join key. Algorithm~\ref{alg:wcc:message} shows that when $dest$ |
| is used for the JOIN clause, $src$ is renamed to $id$ to be used for GROUP BY |
| and vice versa. This query forces multiple redistribute motions in the |
| database which might cause performance degradation. To address this issue, we |
| create a duplicate of the edge table and distribute on the destination column |
| (only for Greenplum systems). |
| |
| \section{Breadth-first Search} \label{sec:graph:bfs} |
| |
| Given a graph $G$ and a user-specified origin vertex $src$, this algorithm |
| searches and discovers connected nodes in a graph in breadth-first order |
| \cite{bfs_wikipedia}. The graph can be treated as either directed or |
| undirected based on a parameter specified when calling the function. |
| There is also a parameter to control the number of hops (edges) to traverse |
| from the source vertex. If not specified, all nodes accessible from the |
| source node will be discovered. |
| |
| \subsection{Implementation Details} |
| \begin{algorithm}[Breadth First Search$(V, E, src)$] \label{alg:bfs:high} |
| \begin{algorithmic}[1] |
| \State Set $dist \leftarrow 0$ |
| \State Create $message$ table with $src$ vertex, $NULL$ parent, and $dist$ |
| \State Copy $message$ table to output table $out$ |
| \Repeat |
| \State Create $toupdate$ table using $out$ and $message$ tables |
| \State $dist \leftarrow dist + 1$ |
| \State Update $message$ table with newly found candidate vertices, parent and $dist$ |
| \State Copy $message$ table to $out$ |
| \Until {There are no candidate vertices remaining in $message$ table} |
| \end{algorithmic} |
| \end{algorithm} |
| |
| The implementation details are similar to SSSP, albeit simpler. We only have to |
| track the number of hops and not the sum of weights, but other requirements and |
| restrictions such as finding the parent, distinct updates, etc. are common in |
| both cases. The output table is initialized only to the $src$ vertex to begin |
| with. A $message$ table is also maintained that contains the list of vertices |
| to traverse and explore in the next iteration, which is also initialized with |
| the $src$ vertex. BFS then runs iteratively until no more candidate vertices |
| remain in the $message$ table, as outlined in~\ref{alg:bfs:high}. |
| |
| At every iteration, $toupdate$ table is updated with vertices that are neighbors |
| of vertices in the $message$ table, that are not already visited in the past |
| (one scan of the $out$ table reveals all the vertices that have already been |
| visited in the past). All such newly found neighboring vertices in the current |
| iteration will have one or more parents, based on how many vertices in the |
| $message$ table have a direct edge to them. Each such vertex in the $message$ |
| table is marked as the parent of such newly found neighboring vertices in |
| the $toupdate$ table. |
| |
| The $message$ table is then cleared and updated with the contents of $toupdate$ |
| table, except that for each new neighboring vertex considered, only one of the |
| parents is recorded as its parent (the node with the smallest id among all |
| parent nodes). The content of this updated $message$ is then copied |
| to the $out$ table, and this process continues until $message$ table is empty. |
| |
| |
| \section{HITS} \label{sec:graph:hits} |
| \begin{figure}[h] |
| \centering |
| \includegraphics[width=0.5\textwidth]{figures/hits_example.pdf} |
| \caption{An example graph for HITS showing normalized hub and authority scores} |
| \label{hits:example} |
| \end{figure} |
| |
| Hyperlink-Induced Topic Search (HITS)~\cite{hits} developed by Jon Kleinberg is |
| a link analysis algorithm that rates Web pages. The idea behind the algorithm is |
| to assign Hub and Authority scores to all vertices. Authorities are analagous to |
| web pages that have good authoritative content and get linked by other web pages. |
| Hubs can be thought of as large directories that themselves do not hold any |
| authoritative content but provide direct links to other authoritative pages. |
| |
| HITS is an iterative algorithm where the Hub and Authority scores of vertices |
| from the previous iteration are used to compute the new scores. The |
| Hub and Authority scores of a vertex $v$, at the $i^{th}$ iteration, denoted by |
| $HUB(v_i)$ and $AUTHORITY(v_i)$ is computed as: |
| \begin{equation} |
| \begin{aligned} |
| AUTHORITY(v_i) = \sum_{u \in M(v)}({HUB(u_{i-1})})\\ |
| HUB(v_i) = \sum_{u \in M(v)}({AUTHORITY(v_{i})}) |
| \end{aligned} |
| \label{eq:hits} |
| \end{equation} |
| |
| where $N$ is the number of vertices in the graph, $M(v)$ represents the set of |
| vertices that have an edge to vertex $v$, and $HUB(u_{i-1})$ and $AUTHORITY |
| (v_{i})$ represent the Hub score of vertex {u} in the $(i-1) ^{th}$ iteration and |
| Authority score of vertex $v$ in the $(i)^{th}$ iteration. |
| |
| \subsection{Implementation Details} \label{sec:hits:implementation} |
| |
| In this section, we discuss the MADlib implementation of HITS in depth. |
| We maintain two tables at every iteration: $message$ and $cur$. The |
| $cur$ table maintains the Hub and Authority scores of all vertices |
| computed in the previous iteration, while $message$ maintains the updated scores |
| of all vertices in the current iteration. |
| |
| \begin{algorithm}[HITS$(V,E)$] \label{alg:hits:high} |
| \begin{algorithmic}[1] |
| \State Create $cur$ table with a default Hub and Authority score of |
| ${1}$ for every vertex |
| \Repeat |
| \State Create empty table $message$. |
| \State Update Authority score in $message$ using Hub scores of vertices |
| in $cur$ |
| \State Update Hub score in $message$ using Authority scores of vertices |
| in $message$ |
| \State Normalize Hub and Authority scores in $message$ using L2 |
| normalization |
| \State Rename $message$ to $cur$ |
| \Until {both Authority and Hub scores have converged or \emph{max} |
| iterations have elapsed} |
| \end{algorithmic} |
| \end{algorithm} |
| |
| The following query is used to create and update the Hub and Authority scores |
| in $message$ table using the Hub scores in $cur$ table. |
| |
| \begin{algorithm}[Update Hub and Authority scores$(cur, edge\_table)$] |
| \label{alg:hits:update} |
| \begin{lstlisting} |
| -- Create message table and update authority scores |
| CREATE TABLE message AS |
| SELECT cur.id AS id, |
| COALESCE(SUM(curalias.hub), 0.0) AS authority, |
| cur.hub AS hub |
| FROM cur |
| LEFT JOIN edge_table ON cur.id = edge_table.dest |
| LEFT JOIN cur AS curalias ON curalias.id = edge_table.dest |
| GROUP BY cur.id, cur.hub |
| ORDER BY cur.id |
| |
| -- Update hub scores in message table: |
| UPDATE message |
| SET hub = subquery.hub FROM |
| ( |
| SELECT message.id AS id, COALESCE(SUM(msgalias.authority), 0) AS hub |
| FROM message |
| LEFT JOIN edge_table ON message.id = edge_table.src |
| LEFT JOIN message AS msgalias ON message.id = edge_table.dest |
| GROUP BY message.id |
| ) AS subquery |
| WHERE subquery.id = message.id |
| |
| \end{lstlisting} |
| \end{algorithm} |
| |
| |
| The Hub and Authority computations are terminated either when a fixed number of |
| iterations are completed, or when both the Hub and Authority scores of all |
| vertices have converged. The Hub/Authority score of a vertex is deemed |
| converged if the absolute difference in its Hub/Authority scores from $cur$ and |
| $message$ are less than a specified threshold. |
| The following query is used to find all the vertices whose Hub/Authority scores |
| have not converged yet. |
| \begin{algorithm}[Check for Hub and Authority convergence$(cur, message, |
| threshold)$] |
| \label{alg:hits:update1} |
| \begin{lstlisting} |
| SELECT DISTINCT cur.id FROM message |
| INNER JOIN cur ON cur.id=message.id |
| WHERE ABS(cur.authority-message.authority) > threshold |
| OR ABS(cur.hub-message.hub) > threshold |
| \end{lstlisting} |
| \end{algorithm} |
| |
| \subsection{Best Practices} \label{sec:hits:bestpractices} |
| |
| The HITS module in MADlib has a few optional parameters: number of iterations $max$, |
| and the threshold for convergence $threshold$. |
| The default values for these parameters when not specified by the user are $100$ |
| and $\frac{1}{N*1000}$ respectively. |
| It is to be noted that this is not based on any experimental study. Users of |
| MADlib are encouraged to consider this factor when setting a value for threshold, |
| since a high $threshold$ value would lead to early termination of computation, |
| thus resulting in incorrect Hub and Authority scores. |