blob: eb11df62790ae82fb4184d20f3c910f7a47e0be4 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# THIS SCRIPT PERFORMS AN ENTITY RESOLUTION PIPELINE FOR CLUSTERING ON A SINGLE FILE
# CONSISTS OF BLOCKING, MATCHING, AND CLUSTERING
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# FX String --- Location to read the frame of tokens in bow format
# Each line contains comma separated list of id, token and value
# OUT String --- Location to save the output of matching pairs
# Each line contains comma separated ids of one matched pair
# Third column provides the similarity score
# threshold Double 0.9 Threshold to be considered as a match
# blocking_method String lsh Possible values: ["naive", "lsh"].
# num_blocks Int 1 Number of blocks for naive blocking
# num_hashtables Int 6 Number of hashtables for LSH blocking.
# num_hyperplanes Int 4 Number of hyperplanes for LSH blocking.
# use_tokens Boolean TRUE Whether to use the tokens of FX to generate predictions
# use_embeddings Boolean FALSE Whether to use the embeddings of XE to generate predictions
# XE String --- Location to read the frame of embedding matrix
# Required if use_embeddings is set to TRUE
# store_mapping Boolean FALSE Whether to store the mapping of transformencode
# MX String --- Location to write the frame of mapping
# Required if store_mapping is set to TRUE
# ---------------------------------------------------------------------------------------------
# OUTPUT: frame of matching pairs
# ---------------------------------------------------------------------------------------------
source("./scripts/staging/entity-resolution/primitives/preprocessing.dml") as pre;
source("./scripts/staging/entity-resolution/primitives/postprocessing.dml") as post;
source("./scripts/staging/entity-resolution/primitives/pipeline.dml") as pipe;
# Command Line Arguments
# FX and OUT have no default and must always be supplied.
fileFX = $FX;
fileOUT = $OUT;
# Optional arguments: ifdef() yields the second value when the argument is not given.
threshold = ifdef($threshold, 0.9);
blocking_method = ifdef($blocking_method, "lsh");
num_blocks = ifdef($num_blocks, 1);
num_hyperplanes = ifdef($num_hyperplanes, 4);
num_hashtables = ifdef($num_hashtables, 6);
use_tokens = ifdef($use_tokens, TRUE);
use_embeddings = ifdef($use_embeddings, FALSE);
# file XE is only required if using embeddings
fileXE = ifdef($XE, "");
# mapping file is required for evaluation
store_mapping = ifdef($store_mapping, FALSE);
fileMX = ifdef($MX, "");
# Abort right away on an unsupported blocking method: continuing would skip
# both clustering branches below and fail later on an undefined CLUSTER.
if (!(blocking_method == "naive" | blocking_method == "lsh")) {
  stop("ERROR: blocking method must be in ['naive', 'lsh']");
}
# Read data
# FX is the frame of tokens; the embedding matrix is only read when requested.
FX = read(fileFX);
if (use_embeddings) {
  # Without XE there is nothing to read and X_embeddings would stay undefined,
  # so halt here with a clear message instead of failing later on.
  if (fileXE == "")
    stop("You need to specify file XE when use_embeddings is set to TRUE");
  else
    X_embeddings = read(fileXE);
}
# Convert data
# The preprocessing primitive returns the feature matrix X together with the
# mapping MX, which is reused below when decoding the results.
[X, MX] = pre::convert_frame_tokens_to_matrix_bow(FX);
# Select the features: tokens, embeddings, or both side by side.
if (use_embeddings) {
  if (use_tokens) {
    X = cbind(X, X_embeddings);
  } else {
    X = X_embeddings;
  }
} else if (!use_tokens) {
  # Neither source was selected; X still holds the token matrix.
  print("Either use_tokens or use_embeddings needs to be TRUE, using tokens only as default.");
}
# Persist the mapping on request; a missing target location is only reported.
if (store_mapping) {
  if (fileMX != "")
    write(MX, fileMX);
  else
    print("You need to specify file MX when store_mapping is set to TRUE.");
}
# Perform clustering
# Dispatch to the pipeline variant that matches the chosen blocking method.
if (blocking_method == "naive") {
  CLUSTER = pipe::entity_clustering_pipeline(X, num_blocks, threshold);
} else if (blocking_method == "lsh") {
  CLUSTER = pipe::entity_clustering_pipeline_lsh(X, num_hashtables, num_hyperplanes, threshold);
} else {
  # No pipeline exists for any other value; CLUSTER would be undefined below.
  stop("ERROR: blocking method must be in ['naive', 'lsh']");
}
# Write results
# NOTE(review): untable presumably turns CLUSTER into (row, column, value)
# triples - confirm against postprocessing.dml.
sparse = post::untable(CLUSTER);
# Decode the first two columns through the first column of MX, which serves
# as recode metadata for both C1 and C2.
dec = transformdecode(target=sparse, meta=cbind(MX[,1],MX[,1]), spec="{recode:[C1,C2]}");
# Append the third column of the triples to the decoded pair.
output = cbind(dec, as.frame(sparse[,3]));
write(output, fileOUT, sep=",", sparse=FALSE, format="csv");