blob: baa9aeec83243ac020e90659967fec379e525d33 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Splits the rows of X into num_blocks non-overlapping regions.
# May produce less than num_blocks regions.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- A dataset with rows to split into blocks.
# num_blocks Integer --- How many blocks to produce.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# BLOCKS Double A column vector with start indices for each block. Has one more row
# than blocks such that the last row is the end index of the last block.
# --------------------------------------------------------------------------------------------
naive_blocking = function(Matrix[Double] X, Integer num_blocks) return (Matrix[Double] BLOCKS) {
block_size_flt = nrow(X) / num_blocks;
if (block_size_flt < 1.0) {
BLOCKS= seq(1, nrow(X) + 1);
} else {
block_size = ceil(block_size_flt);
BLOCKS = rbind(as.matrix(1), 1 + block_size * seq(1, num_blocks));
BLOCKS[num_blocks+1,] = nrow(X) + 1
}
}
# Sorts the rows in dataset X by the vector v. The order is defined by the ascending order of v.
# Can return the vector v as first column of X, depending on parameter prepend_v.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- Any matrix with rows to be sorted.
# v matrix --- A vector with the same number of rows as X. Defines ordering.
# prepend_v boolean --- Whether to return v as first column of X.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_index matrix A vector with the original row number for each row in X.
# X_sorted matrix The sorted input matrix X.
# --------------------------------------------------------------------------------------------
sort_by_vector = function(Matrix[Double] X, Matrix[Double] v, Boolean prepend_v) return (Matrix[Double] X_index, Matrix[Double] X_sorted) {
X_index = order(target=v, by=1, decreasing = FALSE, index.return=TRUE);
X_sorted = order(target=cbind(v, X), by=1, decreasing = FALSE, index.return=FALSE);
if (!prepend_v) {
X_sorted = X_sorted[,2:ncol(X_sorted)];
}
}
# Sorts the rows in dataset X by their sum.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- Any matrix with rows to be sorted.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_index matrix A vector with the original row number for each row in X.
# X_sorted matrix The sorted input matrix X.
# --------------------------------------------------------------------------------------------
row_sum_sorting = function(Matrix[Double] X) return (Matrix[Double] X_index, Matrix[Double] X_sorted) {
X_rowSum = rowSums(X);
[X_index, X_sorted] = sort_by_vector(X, X_rowSum, FALSE);
}
# Sorts the rows in dataset X by their original row numbers (index).
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- Any matrix with rows to be sorted.
# X_index matrix --- The orginal row numbers to be restored.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_reindex matrix The reindexed matrix X.
# --------------------------------------------------------------------------------------------
reindex_rowwise = function(Matrix[Double] X, Matrix[Double] X_index) return (Matrix[Double] X_reindex) {
X_conc = cbind(X_index, X);
X_reindex = order(target=X_conc, by=1, decreasing = FALSE, index.return=FALSE);
# Remove index column
X_reindex = X_reindex[,2:ncol(X_reindex)];
}
# Sorts both rows and columns in dataset X by their original row numbers (index).
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- Any matrix that should be resorted.
# X_index matrix --- The orginal row numbers to be restored.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_reindex matrix The reindexed matrix X.
# --------------------------------------------------------------------------------------------
reindex_rows_and_cols = function(Matrix[Double] X, Matrix[Double] X_index) return (Matrix[Double] X_reindex) {
# First reindex rows
X_reindex = X;
X_reindex = reindex_rowwise(X_reindex, X_index);
# Then transpose and repeat
X_reindex = t(X_reindex);
X_reindex = reindex_rowwise(X_reindex, X_index);
}
# Generates a random matrix of hyperplane parameters.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# num_hyperplanes Integer --- How many hyperplanes to generate.
# dimension Integer --- The number of parameters per hyperplane.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# H matrix A num_hyperplanes x dimension matrix of hyperplane parameters.
# --------------------------------------------------------------------------------------------
gen_rand_hyperplanes = function(Integer num_hyperplanes, Integer dimension) return (Matrix[Double] H) {
H = rand(rows=num_hyperplanes, cols=dimension, min=-1, max=1);
}
# Creates a scalar hash code for each row in X_plane by interpreting positive values as a
# binary number.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X_plane matrix --- The result of a plane equation for each row in X.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_hash matrix A row vector of scalar hash codes. Same number of rows as X_plane.
# --------------------------------------------------------------------------------------------
aggregate_lsh_code = function(Matrix[Double] X_code) return (Matrix[Double] X_hash) {
X_pos = (X_code > 0);
pos_weights = 2^seq(ncol(X_code)-1, 0);
X_weights = X_pos * t(pos_weights);
X_hash = rowSums(X_weights);
}
# Computes locality-sensitive hash codes for each row in X.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- A matrix that should be hashed.
# H matrix --- Hyperplanes matrix w. shape (num_hyperplanes, ncol(X))
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_hash matrix A column vector with hash codes. Same number of rows as X.
# --------------------------------------------------------------------------------------------
lsh = function(Matrix[Double] X, Matrix[Double] H) return (Matrix[Double] X_hash) {
X_plane = X %*% t(H);
X_hash = aggregate_lsh_code(X_plane);
}
lsh_find_blocks = function(Matrix[Double] X, Integer dimensions) return (Matrix[Double] BLOCKS){
BLOCKS = matrix(0, rows=2^dimensions+1, cols=1)
current_hash_value = 0.0;
BLOCKS[1,1] = 1
for (row_idx in 1:nrow(X)) {
this_hash = as.scalar(X[row_idx,1]);
if (current_hash_value < this_hash) {
for (h in (current_hash_value+2):(this_hash+1)) {
BLOCKS[h,1] = row_idx;
}
current_hash_value = this_hash;
}
}
n_missing_rows = nrow(BLOCKS) - (current_hash_value + 1);
BLOCKS[(current_hash_value+2):nrow(BLOCKS),] = matrix(nrow(X) + 1, rows=n_missing_rows, cols=1);
}
# Blocks the given matrix X by locality sensitive hashing using a set of random hyperplanes.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- A matrix that should be blocked by LSH.
# num_hyperplanes Integer --- How many hyperplanes to use.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_index matrix A vector with the original row number for each row in X.
# X_hash matrix A column vector with the scalar LSH hash code for each row in X.
# X_sorted matrix The sorted input matrix X.
# BLOCKS Double A column vector with start indices for each block. Has one more row
# than blocks such that the last row is the end index of the last block.
# --------------------------------------------------------------------------------------------
lsh_blocking = function(Matrix[Double] X, Integer num_hyperplanes) return (Matrix[Double] X_index, Matrix[Double] X_hash, Matrix[Double] X_sorted, Matrix[Double] BLOCKS) {
H = gen_rand_hyperplanes(num_hyperplanes, ncol(X));
X_hash = lsh(X, H);
[X_index, X_sorted_with_hash] = sort_by_vector(X, X_hash, TRUE);
BLOCKS = lsh_find_blocks(X_sorted_with_hash, num_hyperplanes);
B_lengths = rbind(BLOCKS[2:nrow(BLOCKS),] - BLOCKS[1:nrow(BLOCKS)-1,], as.matrix(1));
BLOCKS = removeEmpty(target=BLOCKS, margin="rows", select=B_lengths)
X_sorted = X_sorted_with_hash[,2:ncol(X_sorted_with_hash)];
X_hash = X_sorted_with_hash[,1];
}
# Blocks the matrices X and Y by locality sensitive hashing using a set of random hyperplanes.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
# X matrix --- A matrix that should be blocked by LSH.
# Y matrix --- A matrix that should be blocked by LSH.
# num_hyperplanes Integer --- How many hyperplanes to use.
#
# Output:
# --------------------------------------------------------------------------------------------
# NAME TYPE MEANING
# --------------------------------------------------------------------------------------------
# X_index matrix A vector with the original row number for each row in X.
# X_hash matrix A column vector with the scalar LSH hash code for each row in X.
# X_sorted matrix The sorted input matrix X.
# Y_index matrix A vector with the original row number for each row in Y.
# Y_hash matrix A column vector with the scalar LSH hash code for each row in Y.
# Y_sorted matrix The sorted input matrix Y.
# BLOCKS Double A matrix with start indices for each block. Has one more row
# than blocks such that the last row is the end index of the last block.
# First column is X, second column Y indices.
# --------------------------------------------------------------------------------------------
lsh_blocking2 = function(Matrix[Double] X, Matrix[Double] Y, Integer num_hyperplanes) return (
Matrix[Double] X_index, Matrix[Double] X_hash, Matrix[Double] X_sorted,
Matrix[Double] Y_index, Matrix[Double] Y_hash, Matrix[Double] Y_sorted,
Matrix[Double] BLOCKS) {
H = gen_rand_hyperplanes(num_hyperplanes, max(ncol(X), ncol(Y)));
X_hash = lsh(X, H[,1:ncol(X)]);
Y_hash = lsh(Y, H[,1:ncol(Y)]);
[X_index, Xs] = sort_by_vector(X, X_hash, TRUE);
[Y_index, Ys] = sort_by_vector(Y, Y_hash, TRUE);
X_blocks = lsh_find_blocks(Xs, num_hyperplanes);
Y_blocks = lsh_find_blocks(Ys, num_hyperplanes);
BLOCKS = cbind(X_blocks, Y_blocks)
B_lengths = rbind(BLOCKS[2:nrow(BLOCKS),] - BLOCKS[1:nrow(BLOCKS)-1,], matrix(1, rows=1, cols=2));
BLOCKS = removeEmpty(target=BLOCKS, margin="rows", select=B_lengths)
X_sorted = Xs[,2:ncol(Xs)];
X_hash = Xs[,1];
Y_sorted = Ys[,2:ncol(Ys)];
Y_hash = Ys[,1];
}