scripts/staging/entity-resolution/primitives/blocking.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Splits the rows of X into num_blocks non-overlapping regions.
 # May produce less than num_blocks regions.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X             matrix  ---       A dataset with rows to split into blocks.
 # num_blocks    Integer ---       How many blocks to produce.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # BLOCKS         Double   A column vector with start indices for each block. Has one more row
 #                         than blocks such that the last row is the end index of the last block.
 # --------------------------------------------------------------------------------------------
 naive_blocking = function(Matrix[Double] X, Integer num_blocks) return (Matrix[Double] BLOCKS) {
   block_size_flt = nrow(X) / num_blocks;
   if (block_size_flt < 1.0) {
     BLOCKS= seq(1, nrow(X) + 1);
   } else {
     block_size = ceil(block_size_flt);
     BLOCKS = rbind(as.matrix(1), 1 +  block_size * seq(1, num_blocks));
     BLOCKS[num_blocks+1,] = nrow(X) + 1
   }
 }

 # Sorts the rows in dataset X by the vector v. The order is defined by the ascending order of v.
 # Can return the vector v as first column of X, depending on parameter prepend_v.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X             matrix  ---       Any matrix with rows to be sorted.
 # v             matrix  ---       A vector with the same number of rows as X. Defines ordering.
 # prepend_v     boolean ---       Whether to return v as first column of X.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_index       matrix  A vector with the original row number for each row in X.
 # X_sorted      matrix  The sorted input matrix X.
 # --------------------------------------------------------------------------------------------
 sort_by_vector = function(Matrix[Double] X, Matrix[Double] v, Boolean prepend_v) return (Matrix[Double] X_index, Matrix[Double] X_sorted) {
   X_index = order(target=v, by=1, decreasing = FALSE, index.return=TRUE);
   X_sorted = order(target=cbind(v, X), by=1, decreasing = FALSE, index.return=FALSE);
   if (!prepend_v) {
     X_sorted = X_sorted[,2:ncol(X_sorted)];
   }
 }

 # Sorts the rows in dataset X by their sum.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X             matrix  ---       Any matrix with rows to be sorted.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_index       matrix  A vector with the original row number for each row in X.
 # X_sorted      matrix  The sorted input matrix X.
 # --------------------------------------------------------------------------------------------
 row_sum_sorting = function(Matrix[Double] X) return (Matrix[Double] X_index, Matrix[Double] X_sorted) {
   X_rowSum = rowSums(X);
   [X_index, X_sorted] = sort_by_vector(X, X_rowSum, FALSE);
 }

 # Sorts the rows in dataset X by their original row numbers (index).
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X             matrix  ---       Any matrix with rows to be sorted.
 # X_index       matrix  ---       The orginal row numbers to be restored.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_reindex     matrix  The reindexed matrix X.
 # --------------------------------------------------------------------------------------------
 reindex_rowwise = function(Matrix[Double] X, Matrix[Double] X_index) return (Matrix[Double] X_reindex) {
   X_conc = cbind(X_index, X);
   X_reindex = order(target=X_conc, by=1, decreasing = FALSE, index.return=FALSE);
   # Remove index column
   X_reindex = X_reindex[,2:ncol(X_reindex)];
 }

 # Sorts both rows and columns in dataset X by their original row numbers (index).
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X             matrix  ---       Any matrix that should be resorted.
 # X_index       matrix  ---       The orginal row numbers to be restored.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_reindex     matrix  The reindexed matrix X.
 # --------------------------------------------------------------------------------------------
 reindex_rows_and_cols = function(Matrix[Double] X, Matrix[Double] X_index) return (Matrix[Double] X_reindex) {
   # First reindex rows
   X_reindex = X;
   X_reindex = reindex_rowwise(X_reindex, X_index);
   # Then transpose and repeat
   X_reindex = t(X_reindex);
   X_reindex = reindex_rowwise(X_reindex, X_index);
 }

 # Generates a random matrix of hyperplane parameters.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # num_hyperplanes  Integer ---       How many hyperplanes to generate.
 # dimension        Integer ---       The number of parameters per hyperplane.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # H              matrix   A num_hyperplanes x dimension matrix of hyperplane parameters.
 # --------------------------------------------------------------------------------------------
 gen_rand_hyperplanes = function(Integer num_hyperplanes, Integer dimension) return (Matrix[Double] H) {
   H = rand(rows=num_hyperplanes, cols=dimension, min=-1, max=1);
 }

 # Creates a scalar hash code for each row in X_plane by interpreting positive values as a
 # binary number.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X_plane         matrix  ---       The result of a plane equation for each row in X.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_hash        matrix   A row vector of scalar hash codes. Same number of rows as X_plane.
 # --------------------------------------------------------------------------------------------
 aggregate_lsh_code = function(Matrix[Double] X_code) return (Matrix[Double] X_hash) {
   X_pos = (X_code > 0);
   pos_weights = 2^seq(ncol(X_code)-1, 0);
   X_weights = X_pos * t(pos_weights);
   X_hash = rowSums(X_weights);
 }

 # Computes locality-sensitive hash codes for each row in X.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X               matrix  ---       A matrix that should be hashed.
 # H               matrix  ---       Hyperplanes matrix w. shape (num_hyperplanes, ncol(X))
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_hash        matrix   A column vector with hash codes. Same number of rows as X.
 # --------------------------------------------------------------------------------------------
 lsh = function(Matrix[Double] X, Matrix[Double] H) return (Matrix[Double] X_hash) {
   X_plane = X %*% t(H);
   X_hash = aggregate_lsh_code(X_plane);
 }

 lsh_find_blocks = function(Matrix[Double] X, Integer dimensions) return (Matrix[Double] BLOCKS){
   BLOCKS = matrix(0, rows=2^dimensions+1, cols=1)
   current_hash_value = 0.0;
   BLOCKS[1,1] = 1
   for (row_idx in 1:nrow(X)) {
     this_hash = as.scalar(X[row_idx,1]);
     if (current_hash_value < this_hash) {
       for (h in (current_hash_value+2):(this_hash+1)) {
         BLOCKS[h,1] = row_idx;
       }
       current_hash_value = this_hash;
     }
   }
   n_missing_rows = nrow(BLOCKS) - (current_hash_value + 1);
   BLOCKS[(current_hash_value+2):nrow(BLOCKS),] = matrix(nrow(X) + 1, rows=n_missing_rows, cols=1);
 }

 # Blocks the given matrix X by locality sensitive hashing using a set of random hyperplanes.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X               matrix  ---       A matrix that should be blocked by LSH.
 # num_hyperplanes Integer ---       How many hyperplanes to use.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_index       matrix  A vector with the original row number for each row in X.
 # X_hash        matrix  A column vector with the scalar LSH hash code for each row in X.
 # X_sorted      matrix  The sorted input matrix X.
 # BLOCKS        Double  A column vector with start indices for each block. Has one more row
 #                        than blocks such that the last row is the end index of the last block.
 # --------------------------------------------------------------------------------------------
 lsh_blocking = function(Matrix[Double] X, Integer num_hyperplanes) return (Matrix[Double] X_index, Matrix[Double] X_hash, Matrix[Double] X_sorted, Matrix[Double] BLOCKS) {
   H = gen_rand_hyperplanes(num_hyperplanes, ncol(X));
   X_hash = lsh(X, H);
   [X_index, X_sorted_with_hash] = sort_by_vector(X, X_hash, TRUE);
   BLOCKS = lsh_find_blocks(X_sorted_with_hash, num_hyperplanes);
   B_lengths = rbind(BLOCKS[2:nrow(BLOCKS),] - BLOCKS[1:nrow(BLOCKS)-1,], as.matrix(1));
   BLOCKS = removeEmpty(target=BLOCKS, margin="rows", select=B_lengths)
   X_sorted = X_sorted_with_hash[,2:ncol(X_sorted_with_hash)];
   X_hash = X_sorted_with_hash[,1];
 }

 # Blocks the matrices X and Y by locality sensitive hashing using a set of random hyperplanes.
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT   MEANING
 # --------------------------------------------------------------------------------------------
 # X               matrix  ---       A matrix that should be blocked by LSH.
 # Y               matrix  ---       A matrix that should be blocked by LSH.
 # num_hyperplanes Integer ---       How many hyperplanes to use.
 #
 # Output:
 # --------------------------------------------------------------------------------------------
 # NAME          TYPE     MEANING
 # --------------------------------------------------------------------------------------------
 # X_index       matrix  A vector with the original row number for each row in X.
 # X_hash        matrix  A column vector with the scalar LSH hash code for each row in X.
 # X_sorted      matrix  The sorted input matrix X.
 # Y_index       matrix  A vector with the original row number for each row in Y.
 # Y_hash        matrix  A column vector with the scalar LSH hash code for each row in Y.
 # Y_sorted      matrix  The sorted input matrix Y.
 # BLOCKS        Double  A matrix with start indices for each block. Has one more row
 #                        than blocks such that the last row is the end index of the last block.
 #                        First column is X, second column Y indices.
 # --------------------------------------------------------------------------------------------
 lsh_blocking2 = function(Matrix[Double] X, Matrix[Double] Y, Integer num_hyperplanes) return (
         Matrix[Double] X_index, Matrix[Double] X_hash, Matrix[Double] X_sorted,
         Matrix[Double] Y_index, Matrix[Double] Y_hash, Matrix[Double] Y_sorted,
         Matrix[Double] BLOCKS) {
   H = gen_rand_hyperplanes(num_hyperplanes, max(ncol(X), ncol(Y)));
   X_hash = lsh(X, H[,1:ncol(X)]);
   Y_hash = lsh(Y, H[,1:ncol(Y)]);
   [X_index, Xs] = sort_by_vector(X, X_hash, TRUE);
   [Y_index, Ys] = sort_by_vector(Y, Y_hash, TRUE);
   X_blocks = lsh_find_blocks(Xs, num_hyperplanes);
   Y_blocks = lsh_find_blocks(Ys, num_hyperplanes);
   BLOCKS = cbind(X_blocks, Y_blocks)
   B_lengths = rbind(BLOCKS[2:nrow(BLOCKS),] - BLOCKS[1:nrow(BLOCKS)-1,], matrix(1, rows=1, cols=2));
   BLOCKS = removeEmpty(target=BLOCKS, margin="rows", select=B_lengths)
   X_sorted = Xs[,2:ncol(Xs)];
   X_hash = Xs[,1];
   Y_sorted = Ys[,2:ncol(Ys)];
   Y_hash = Ys[,1];
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Splits the rows of X into num_blocks non-overlapping regions.
	# May produce less than num_blocks regions.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- A dataset with rows to split into blocks.
	# num_blocks Integer --- How many blocks to produce.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# BLOCKS Double A column vector with start indices for each block. Has one more row
	# than blocks such that the last row is the end index of the last block.
	# --------------------------------------------------------------------------------------------
	naive_blocking = function(Matrix[Double] X, Integer num_blocks) return (Matrix[Double] BLOCKS) {
	block_size_flt = nrow(X) / num_blocks;
	if (block_size_flt < 1.0) {
	BLOCKS= seq(1, nrow(X) + 1);
	} else {
	block_size = ceil(block_size_flt);
	BLOCKS = rbind(as.matrix(1), 1 + block_size * seq(1, num_blocks));
	BLOCKS[num_blocks+1,] = nrow(X) + 1
	}
	}

	# Sorts the rows in dataset X by the vector v. The order is defined by the ascending order of v.
	# Can return the vector v as first column of X, depending on parameter prepend_v.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- Any matrix with rows to be sorted.
	# v matrix --- A vector with the same number of rows as X. Defines ordering.
	# prepend_v boolean --- Whether to return v as first column of X.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_index matrix A vector with the original row number for each row in X.
	# X_sorted matrix The sorted input matrix X.
	# --------------------------------------------------------------------------------------------
	sort_by_vector = function(Matrix[Double] X, Matrix[Double] v, Boolean prepend_v) return (Matrix[Double] X_index, Matrix[Double] X_sorted) {
	X_index = order(target=v, by=1, decreasing = FALSE, index.return=TRUE);
	X_sorted = order(target=cbind(v, X), by=1, decreasing = FALSE, index.return=FALSE);
	if (!prepend_v) {
	X_sorted = X_sorted[,2:ncol(X_sorted)];
	}
	}

	# Sorts the rows in dataset X by their sum.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- Any matrix with rows to be sorted.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_index matrix A vector with the original row number for each row in X.
	# X_sorted matrix The sorted input matrix X.
	# --------------------------------------------------------------------------------------------
	row_sum_sorting = function(Matrix[Double] X) return (Matrix[Double] X_index, Matrix[Double] X_sorted) {
	X_rowSum = rowSums(X);
	[X_index, X_sorted] = sort_by_vector(X, X_rowSum, FALSE);
	}

	# Sorts the rows in dataset X by their original row numbers (index).
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- Any matrix with rows to be sorted.
	# X_index matrix --- The orginal row numbers to be restored.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_reindex matrix The reindexed matrix X.
	# --------------------------------------------------------------------------------------------
	reindex_rowwise = function(Matrix[Double] X, Matrix[Double] X_index) return (Matrix[Double] X_reindex) {
	X_conc = cbind(X_index, X);
	X_reindex = order(target=X_conc, by=1, decreasing = FALSE, index.return=FALSE);
	# Remove index column
	X_reindex = X_reindex[,2:ncol(X_reindex)];
	}

	# Sorts both rows and columns in dataset X by their original row numbers (index).
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- Any matrix that should be resorted.
	# X_index matrix --- The orginal row numbers to be restored.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_reindex matrix The reindexed matrix X.
	# --------------------------------------------------------------------------------------------
	reindex_rows_and_cols = function(Matrix[Double] X, Matrix[Double] X_index) return (Matrix[Double] X_reindex) {
	# First reindex rows
	X_reindex = X;
	X_reindex = reindex_rowwise(X_reindex, X_index);
	# Then transpose and repeat
	X_reindex = t(X_reindex);
	X_reindex = reindex_rowwise(X_reindex, X_index);
	}

	# Generates a random matrix of hyperplane parameters.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# num_hyperplanes Integer --- How many hyperplanes to generate.
	# dimension Integer --- The number of parameters per hyperplane.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# H matrix A num_hyperplanes x dimension matrix of hyperplane parameters.
	# --------------------------------------------------------------------------------------------
	gen_rand_hyperplanes = function(Integer num_hyperplanes, Integer dimension) return (Matrix[Double] H) {
	H = rand(rows=num_hyperplanes, cols=dimension, min=-1, max=1);
	}

	# Creates a scalar hash code for each row in X_plane by interpreting positive values as a
	# binary number.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X_plane matrix --- The result of a plane equation for each row in X.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_hash matrix A row vector of scalar hash codes. Same number of rows as X_plane.
	# --------------------------------------------------------------------------------------------
	aggregate_lsh_code = function(Matrix[Double] X_code) return (Matrix[Double] X_hash) {
	X_pos = (X_code > 0);
	pos_weights = 2^seq(ncol(X_code)-1, 0);
	X_weights = X_pos * t(pos_weights);
	X_hash = rowSums(X_weights);
	}

	# Computes locality-sensitive hash codes for each row in X.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- A matrix that should be hashed.
	# H matrix --- Hyperplanes matrix w. shape (num_hyperplanes, ncol(X))
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_hash matrix A column vector with hash codes. Same number of rows as X.
	# --------------------------------------------------------------------------------------------
	lsh = function(Matrix[Double] X, Matrix[Double] H) return (Matrix[Double] X_hash) {
	X_plane = X %*% t(H);
	X_hash = aggregate_lsh_code(X_plane);
	}

	lsh_find_blocks = function(Matrix[Double] X, Integer dimensions) return (Matrix[Double] BLOCKS){
	BLOCKS = matrix(0, rows=2^dimensions+1, cols=1)
	current_hash_value = 0.0;
	BLOCKS[1,1] = 1
	for (row_idx in 1:nrow(X)) {
	this_hash = as.scalar(X[row_idx,1]);
	if (current_hash_value < this_hash) {
	for (h in (current_hash_value+2):(this_hash+1)) {
	BLOCKS[h,1] = row_idx;
	}
	current_hash_value = this_hash;
	}
	}
	n_missing_rows = nrow(BLOCKS) - (current_hash_value + 1);
	BLOCKS[(current_hash_value+2):nrow(BLOCKS),] = matrix(nrow(X) + 1, rows=n_missing_rows, cols=1);
	}

	# Blocks the given matrix X by locality sensitive hashing using a set of random hyperplanes.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- A matrix that should be blocked by LSH.
	# num_hyperplanes Integer --- How many hyperplanes to use.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_index matrix A vector with the original row number for each row in X.
	# X_hash matrix A column vector with the scalar LSH hash code for each row in X.
	# X_sorted matrix The sorted input matrix X.
	# BLOCKS Double A column vector with start indices for each block. Has one more row
	# than blocks such that the last row is the end index of the last block.
	# --------------------------------------------------------------------------------------------
	lsh_blocking = function(Matrix[Double] X, Integer num_hyperplanes) return (Matrix[Double] X_index, Matrix[Double] X_hash, Matrix[Double] X_sorted, Matrix[Double] BLOCKS) {
	H = gen_rand_hyperplanes(num_hyperplanes, ncol(X));
	X_hash = lsh(X, H);
	[X_index, X_sorted_with_hash] = sort_by_vector(X, X_hash, TRUE);
	BLOCKS = lsh_find_blocks(X_sorted_with_hash, num_hyperplanes);
	B_lengths = rbind(BLOCKS[2:nrow(BLOCKS),] - BLOCKS[1:nrow(BLOCKS)-1,], as.matrix(1));
	BLOCKS = removeEmpty(target=BLOCKS, margin="rows", select=B_lengths)
	X_sorted = X_sorted_with_hash[,2:ncol(X_sorted_with_hash)];
	X_hash = X_sorted_with_hash[,1];
	}

	# Blocks the matrices X and Y by locality sensitive hashing using a set of random hyperplanes.
	#
	# INPUT PARAMETERS:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# --------------------------------------------------------------------------------------------
	# X matrix --- A matrix that should be blocked by LSH.
	# Y matrix --- A matrix that should be blocked by LSH.
	# num_hyperplanes Integer --- How many hyperplanes to use.
	#
	# Output:
	# --------------------------------------------------------------------------------------------
	# NAME TYPE MEANING
	# --------------------------------------------------------------------------------------------
	# X_index matrix A vector with the original row number for each row in X.
	# X_hash matrix A column vector with the scalar LSH hash code for each row in X.
	# X_sorted matrix The sorted input matrix X.
	# Y_index matrix A vector with the original row number for each row in Y.
	# Y_hash matrix A column vector with the scalar LSH hash code for each row in Y.
	# Y_sorted matrix The sorted input matrix Y.
	# BLOCKS Double A matrix with start indices for each block. Has one more row
	# than blocks such that the last row is the end index of the last block.
	# First column is X, second column Y indices.
	# --------------------------------------------------------------------------------------------
	lsh_blocking2 = function(Matrix[Double] X, Matrix[Double] Y, Integer num_hyperplanes) return (
	Matrix[Double] X_index, Matrix[Double] X_hash, Matrix[Double] X_sorted,
	Matrix[Double] Y_index, Matrix[Double] Y_hash, Matrix[Double] Y_sorted,
	Matrix[Double] BLOCKS) {
	H = gen_rand_hyperplanes(num_hyperplanes, max(ncol(X), ncol(Y)));
	X_hash = lsh(X, H[,1:ncol(X)]);
	Y_hash = lsh(Y, H[,1:ncol(Y)]);
	[X_index, Xs] = sort_by_vector(X, X_hash, TRUE);
	[Y_index, Ys] = sort_by_vector(Y, Y_hash, TRUE);
	X_blocks = lsh_find_blocks(Xs, num_hyperplanes);
	Y_blocks = lsh_find_blocks(Ys, num_hyperplanes);
	BLOCKS = cbind(X_blocks, Y_blocks)
	B_lengths = rbind(BLOCKS[2:nrow(BLOCKS),] - BLOCKS[1:nrow(BLOCKS)-1,], matrix(1, rows=1, cols=2));
	BLOCKS = removeEmpty(target=BLOCKS, margin="rows", select=B_lengths)
	X_sorted = Xs[,2:ncol(Xs)];
	X_hash = Xs[,1];
	Y_sorted = Ys[,2:ncol(Ys)];
	Y_hash = Ys[,1];
	}