#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# This script computes vertex embeddings for a given graph via DeepWalk
# (Perozzi et al., 2014: https://arxiv.org/pdf/1403.6652.pdf)
#
# INPUT:
# ------------------------------------------------------------------------------------
# Graph   adjacency matrix of the graph (n x n)
# w       window size, i.e. the number of context vertices considered on each side
# d       embedding size
# gamma   number of random walks started per vertex
# t       walk length (number of steps per random walk)
# alpha   initial learning rate
# beta    decay factor applied to the learning rate after each pass over the vertices
# ------------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------------------
# Phi     matrix of vertex/word representations (n x d)
# ------------------------------------------------------------------------------------------

source("scripts/staging/entity-resolution/primitives/postprocessing.dml") as post;

m_deepWalk = function(Matrix[Double] Graph, Integer w, Integer d,
  Integer gamma, Integer t, Double alpha=0.025, Double beta=0.9)
  return(Matrix[Double] Phi)
{
  word_count = nrow(Graph)
  tree_depth = ceil(log(word_count, 2))

  # build the binary tree over the vocabulary (used for hierarchical softmax)
  # TODO: currently we make a full binary tree whose depth depends on the word count, e.g.
  # TODO: for a word count of 5 we create a binary tree with 8 leaves and 7 inner nodes
  T = createBinaryTreeMatrix(word_count)

  # initialize the node vectors and the representation matrix Phi with uniform random values in [0, 1]
  # TODO: we initialize the full binary tree -> not necessary if we have fewer leaves
  Theta = rand(rows=2^tree_depth-1, cols=d)
  Phi = rand(rows=word_count, cols=d)

  for (i in 1:gamma) {
    # shuffle the vertices and start one random walk from each of them
    vocab_shuffled = sample(word_count, word_count, FALSE)
    for (node_idx in 1:length(vocab_shuffled)) {
      # sample a walk of length t and train skip-gram on it
      random_walk = randomWalk(Graph, as.scalar(vocab_shuffled[node_idx]), t)
      [Phi, Theta] = skipGram(Phi, Theta, T, random_walk, w, alpha)
    }
    # decay the learning rate after each pass over the vertices
    alpha = alpha * beta
  }
}

# builds a binary tree, represented as an adjacency matrix, in which
# inner node i has the two children 2*i and 2*i+1 (heap numbering)
createBinaryTreeMatrix = function(Integer word_count)
  return(Matrix[Double] Tree)
{
  # each inner node i in 1..(word_count-1) is connected to its children 2*i and 2*i+1
  rix = matrix(seq(1,word_count-1) %*% matrix(1,1,2), 2*(word_count-1), 1)
  cix = 1+seq(1,2*(word_count-1));
  Tree = table(rix, cix); # place 1s at the (parent, child) positions
}
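
# For illustration only (not part of the original script): createBinaryTreeMatrix(4)
# yields a 3 x 7 matrix with Tree[1,2] = Tree[1,3] = Tree[2,4] = Tree[2,5] = Tree[3,6] = Tree[3,7] = 1,
# i.e. node 1 -> {2,3}, node 2 -> {4,5}, node 3 -> {6,7}.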

# performs a uniform random walk of the given length, starting at start_vertex
randomWalk = function(Matrix[Double] Graph, Integer start_vertex, Integer walk_length)
  return(Matrix[Double] random_walk)
{
  random_walk = matrix(0, rows=walk_length+1, cols=1)
  random_walk[1] = start_vertex

  current_vertex = start_vertex
  for (i in 1:walk_length) {
    # collect the column indices of the non-zero entries of the current row,
    # i.e. the neighbors of the current vertex
    neighbors_untable = post::untable(Graph[current_vertex, ])
    neighbors = neighbors_untable[, 2]
    # move to a uniformly sampled neighbor
    index = as.scalar(sample(nrow(neighbors), 1))
    current_vertex = as.scalar(neighbors[index])
    random_walk[i+1] = current_vertex
  }
}

skipGram = function(Matrix[double] Phi, Matrix[double] Theta,
  Matrix[double] Tree, Matrix[double] walk, Integer window_size, Double alpha)
  return(Matrix[double] Phi_new, Matrix[double] Theta_new)
{
  Phi_new = Phi
  Theta_new = Theta
  tree_depth = ceil(log(nrow(Phi), 2))

  # TODO: check if parallelizable for different datasets; works for current test -> sparsity, updates
  parfor (w_i in 1:nrow(walk), check=0) {
    # determine the leftmost/rightmost walk positions inside the window around position w_i
    min_val = max(1, w_i-window_size)
    max_val = min(nrow(walk), w_i+window_size)

    # update with the context vertices to the left of the center vertex walk[w_i]
    if (w_i != 1) {
      left_neighbors = walk[min_val:(w_i-1)]
      for (u_k in 1:nrow(left_neighbors)) {
        [Phi_new, Theta_new] = update(Phi_new, Theta_new,
          Tree, tree_depth, as.scalar(left_neighbors[u_k]), as.scalar(walk[w_i]), alpha)
      }
    }
    # update with the context vertices to the right of the center vertex walk[w_i]
    if (w_i != nrow(walk)) {
      right_neighbors = walk[(w_i+1):max_val]
      for (u_k in 1:nrow(right_neighbors)) {
        [Phi_new, Theta_new] = update(Phi_new, Theta_new,
          Tree, tree_depth, as.scalar(right_neighbors[u_k]), as.scalar(walk[w_i]), alpha)
      }
    }
  }
}

update = function(Matrix[double] Phi, Matrix[double] Theta,
  Matrix[double] Tree, Integer tree_depth, Integer u, Integer v, Double alpha)
  return (Matrix[double] Phi_new, Matrix[double] Theta_new)
{
  Phi_new = Phi
  Theta_new = Theta

  # the binary code of u determines the root-to-leaf path in the tree (hierarchical softmax)
  u_binary = toBinaryArray(u, tree_depth)
  path_to_u = getNodeIds(Tree, u_binary)

  gradients = computeGradients(u, v, Theta, path_to_u, Phi, tree_depth)

  # negative gradient of the loss w.r.t. the node vectors on the path:
  # row i holds (code_i - sigmoid(theta_i^T phi_v)) * phi_v for the i-th path node (Theta update)
  neg_gradient = gradients * Phi[v]
  for (i in 1:nrow(gradients))
    Theta_new[as.scalar(path_to_u[i])] = Theta_new[as.scalar(path_to_u[i])] + alpha * neg_gradient[i]

  # negative gradient of the loss w.r.t. phi_v: sum of gradients[i] * theta_i over the path (Phi update)
  P = table(seq(1,tree_depth), path_to_u[1:tree_depth], tree_depth, nrow(Theta)); # path selection matrix
  target_theta = P %*% Theta;
  neg_gradient = t(gradients) %*% target_theta
  Phi_new[v] = Phi_new[v] + alpha * neg_gradient
}

computeGradients = function(Integer u, Integer v, Matrix[double] Theta,
  Matrix[double] path, Matrix[double] Phi, Integer tree_depth)
  return(Matrix[Double] gradients)
{
  u_binary = toBinaryArray(u, tree_depth)
  # select the node vectors along the path and compute their dot products with phi_v
  P = table(seq(1,tree_depth), path, tree_depth, nrow(Theta));
  u_dot_v = P %*% Theta %*% t(Phi[v]);
  # gradient of the log-likelihood w.r.t. each dot product: code bit minus predicted probability
  gradients = u_binary - sigmoid(u_dot_v);
}
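
# Background note (not from the original script): with hierarchical softmax the probability of
# a context vertex u given phi_v factorizes along its tree path as
#   Pr(u | phi_v) = prod_i sigmoid(theta_i^T phi_v)^code_i * (1 - sigmoid(theta_i^T phi_v))^(1 - code_i),
# so the gradient of the log-likelihood w.r.t. each dot product is code_i - sigmoid(theta_i^T phi_v),
# which is exactly what computeGradients returns.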

# converts a node id into its binary code of length tree_depth (least-significant bit first)
toBinaryArray = function(Integer node_id, Integer tree_depth)
  return(Matrix[double] binary_array)
{
  binary_array = matrix(0, rows=tree_depth, cols=1)
  for (i in 1:tree_depth) {
    binary_array[i] = node_id %% 2
    node_id = node_id %/% 2
  }
}
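
# For illustration only (not part of the original script): toBinaryArray(5, 3) yields the
# column vector (1, 0, 1), i.e. the bits of 5 = 101b read from the least-significant bit upwards.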

# based on a binary sequence, follow the corresponding path in the tree;
# returns the node ids on the path to the leaf, excluding the leaf node itself
getNodeIds = function(Matrix[double] Tree, Matrix[double] binary_array)
  return(Matrix[double] node_id_array)
{
  seq_len = nrow(binary_array)
  node_id_array = matrix(0, rows=seq_len, cols=1)
  cur_node_id = 1
  node_id_array[1] = cur_node_id
  for (i in 1:(seq_len-1)) {
    # the two children of the current node; the current bit selects which one to descend into
    neighbors_untable = post::untable(Tree[cur_node_id, ])
    neighbors = neighbors_untable[, 2]
    node_id_array[i+1] = as.scalar(neighbors[1 + as.scalar(binary_array[i])])
    cur_node_id = as.scalar(node_id_array[i+1])
  }
}