blob: 9ffb0280166c1ac8b98787ab3e8b7ca1283d3af3 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#------------------------------------------------------------------------------------------------------------------
# Implements builtin for deduplication using matching dependencies (e.g. Street 0.95, City 0.90 -> ZIP 1.0)
# and Jaccard similarity (two values match when their similarity is >= the given threshold).
#
# INPUT PARAMETERS:
# -----------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# -----------------------------------------------------------------------------------------------------------------
# X Frame -- Input Frame X
# LHSfeatures Matrix[Integer] -- A 1xd matrix of column indices of X that form the left-hand side of the MD
# (e.g. Street 0.95, City 0.90 -> ZIP 1.0)
# LHSthreshold Matrix[Double] -- A 1xd matrix of similarity thresholds in the interval [0, 1], one per LHS column
# RHSfeatures Matrix[Integer] -- A 1xd matrix of column indices of X that form the right-hand side of the MD
# RHSthreshold Matrix[Double] -- A 1xd matrix of similarity thresholds in the interval [0, 1], one per RHS column
# verbose Boolean -- If TRUE, print the resulting MD matrix
# -----------------------------------------------------------------------------------------------------------------
#
# Output(s)
# -----------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# -----------------------------------------------------------------------------------------------------------------
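# MD Matrix[Double] --- An nx1 matrix: rows detected as duplicates hold the id of their connected
# component of matching rows (as returned by components()), all other rows hold 0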
# Entry point: validates the matching-dependency specification, builds the LHS and RHS
# adjacency matrices over the rows of X and derives the duplicate groups from them.
#   X             input frame of strings (n rows, d columns)
#   LHSfeatures   1xk matrix of column indices of X for the left-hand side
#   LHSthreshold  1xk matrix of similarity thresholds in [0, 1], one per LHS column
#   RHSfeatures   1xm matrix of column indices of X for the right-hand side
#   RHSthreshold  1xm matrix of similarity thresholds in [0, 1], one per RHS column
#   verbose       if TRUE, the resulting MD matrix is printed
# Returns MD, the nx1 matrix produced by detectDuplicates.
# Stops with an "Invalid input" error when any of the checks below fails.
s_mdedup = function(Frame[String] X, Matrix[Double] LHSfeatures, Matrix[Double] LHSthreshold,
    Matrix[Double] RHSfeatures, Matrix[Double] RHSthreshold, Boolean verbose)
  return(Matrix[Double] MD)
{
  n = nrow(X)
  d = ncol(X)

  # The original chained comparison "0 > total > d" could never fire; the column
  # counts cannot be negative, so only the upper bound needs an explicit test.
  numFeatures = ncol(LHSfeatures) + ncol(RHSfeatures)
  if (numFeatures > d)
    stop("Invalid input: total number of LHS and RHS features should be in interval [0, " + d + "]")

  if ((ncol(LHSfeatures) != ncol(LHSthreshold)) | (ncol(RHSfeatures) != ncol(RHSthreshold)))
    stop("Invalid input: number of thresholds and columns to compare should be equal for LHS and RHS.")

  # Feature values are used as column indices into X, so both bounds matter.
  if (min(LHSfeatures) < 1 | min(RHSfeatures) < 1 | max(LHSfeatures) > d | max(RHSfeatures) > d)
    stop("Invalid input: feature values should be in the interval [1, " + d + "]")

  # Enforce the full [0, 1] interval promised by the error message (lower bound was unchecked).
  if (min(LHSthreshold) < 0 | min(RHSthreshold) < 0 | max(LHSthreshold) > 1 | max(RHSthreshold) > 1)
    stop("Invalid input: threshold values should be in the interval [0, 1].")

  LHS_MD = getMDAdjacency(X, LHSfeatures, LHSthreshold)

  # The RHS adjacency is only computed when at least one pair satisfies the LHS;
  # otherwise an all-zero matrix is passed on.
  RHS_MD = matrix(0, n, n)
  if (sum(LHS_MD) > 0) {
    RHS_MD = getMDAdjacency(X, RHSfeatures, RHSthreshold)
  }

  MD = detectDuplicates(LHS_MD, RHS_MD)

  if(verbose)
    print(toString(MD))
}
# Builds an n x n 0/1 matrix over the rows of X: an entry is 1 when the symmetrized
# Jaccard similarity of the two rows reaches the threshold on every listed column.
#   X           input frame of strings
#   features    1xk matrix; entry k is the column of X to compare
#   thresholds  1xk matrix; entry k is the minimum similarity for that column
# Returns the adjacency matrix (all zeros when no feature is given).
getMDAdjacency = function(Frame[String] X, Matrix[Double] features, Matrix[Double] thresholds)
  return(Matrix[Double] adjacency)
{
  numRows = nrow(X)
  numFeatures = ncol(features)
  adjacency = matrix(0, numRows, numRows)

  keepGoing = TRUE
  k = 1
  while ((k <= numFeatures) & keepGoing) {
    colIdx = as.scalar(features[1, k])
    minSim = as.scalar(thresholds[1, k])

    # pairwise similarity between the values of the selected column
    column = X[, colIdx]
    simFrame = map(column, "(x, y) -> UtilFunctions.jaccardSim(x, y)")
    pairSim = as.matrix(simFrame)
    # add the transpose so that entries (a, b) and (b, a) agree
    symSim = pairSim + t(pairSim)

    matches = symSim >= minSim
    if (k == 1) {
      adjacency = matches
    } else {
      adjacency = adjacency & matches
    }

    # once no pair is left, the remaining columns cannot add any match
    keepGoing = sum(adjacency) > 0
    k = k + 1
  }
}
# Combines the LHS and RHS adjacency matrices and labels duplicate rows.
#   LHS_adj  adjacency matrix of row pairs satisfying the left-hand side
#   RHS_adj  adjacency matrix of row pairs satisfying the right-hand side
# Returns MD: for every row that has at least one match in the combined graph, the
# label assigned by components(); 0 for rows without any match.
detectDuplicates = function(Matrix[Double] LHS_adj, Matrix[Double] RHS_adj)
  return(Matrix[Double] MD)
{
  numRows = nrow(LHS_adj)

  # a pair is kept only when it is present in both adjacency matrices
  combined = LHS_adj * RHS_adj
  graph = combined[1:numRows, 1:numRows]

  # rows with no neighbour are masked out of the result
  hasMatch = rowSums(graph) > 0
  groupIds = components(G=graph, verbose=FALSE)
  MD = groupIds * hasMatch
}