scripts/staging/entity-resolution/binary-entity-resolution.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT PERFORMS AN ENTITY RESOLUTION PIPELINE FOR BINARY MATCHING ON TWO FILES
 #
 # INPUT PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME           TYPE   DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # FX              String  ---     Location to read the frame of tokens in bow format for the first dataset
 #                                 Each line contains comma separated list of id, token and value
 # FY              String  ---     Location to read the frame of tokens in bow format for the second dataset
 #                                 Each line contains comma separated list of id, token and value
 # OUT             String  ---     Location to save the output of matching pairs
 #                                 Each line contains comma separated ids of one matched pair
 #                                 First column is the id of the first dataset, while the second column is the id of the second one
 #                                 Third column provides the similarity score
 # threshold       Double  0.9     Threshold to be considered as a match
 # num_hashtables  Int     6       Number of hashtables for LSH blocking.
 # num_hyperplanes Int     4       Number of hyperplanes for LSH blocking.
 # use_tokens      Boolean TRUE    Whether to use the tokens of FX and FY to generate predictions
 # use_embeddings  Boolean FALSE   Whether to use the embeddings of XE and YE to generate predictions
 # XE              String  ---     Location to read the frame of embedding matrix for the first dataset
 #                                 Required if use_embeddings is set to TRUE
 # YE              String  ---     Location to read the frame of embedding matrix for the second dataset
 #                                Required if use_embeddings is set to TRUE
 # ---------------------------------------------------------------------------------------------
 # OUTPUT: frame of matching pairs
 # ---------------------------------------------------------------------------------------------

 source("./scripts/staging/entity-resolution/primitives/postprocessing.dml") as post;
 source("./scripts/staging/entity-resolution/primitives/preprocessing.dml") as pre;
 source("./scripts/staging/entity-resolution/primitives/pipeline.dml") as pipe;

 # Command Line Arguments
 # FX, FY and OUT have no default; every other argument falls back to the value given in ifdef().
 fileFX = $FX;
 fileFY = $FY;
 fileOUT = $OUT;

 threshold = ifdef($threshold, 0.9);
 num_hashtables = ifdef($num_hashtables, 6);
 num_hyperplanes = ifdef($num_hyperplanes, 4);

 use_tokens = ifdef($use_tokens, TRUE);
 use_embeddings = ifdef($use_embeddings, FALSE);
 # Files XE and YE are only required when using embeddings
 fileXE = ifdef($XE, "");
 fileYE = ifdef($YE, "");

 # Read data
 FX = read(fileFX);
 FY = read(fileFY);
 if (use_embeddings) {
   if (fileXE == "" | fileYE == "") {
     # Abort here: continuing would leave X_embeddings / Y_embeddings undefined
     # and the script would fail later with an unrelated error.
     stop("You need to specify file XE and YE when use_embeddings is set to TRUE");
   } else {
     X_embeddings = read(fileXE);
     Y_embeddings = read(fileYE);
   }
 }

 # Convert data
 # The token frames are converted unconditionally, because MX_ids and MY_ids
 # are needed further down to decode the ids of the matched pairs.
 [X, Y, M_tokens, MX_ids, MY_ids] = pre::convert_frame_tokens_to_matrix_bow_2(FX,FY);
 if (use_tokens & use_embeddings) {
   # Use both feature sets side by side: token columns followed by embedding columns
   X = cbind(X, X_embeddings);
   Y = cbind(Y, Y_embeddings);
 } else if (use_tokens) {
   # Nothing to do in this case, since X already contains tokens
 } else if (use_embeddings) {
   # Embeddings only: replace the token features entirely
   X = X_embeddings;
   Y = Y_embeddings;
 } else {
   # Neither flag set: X and Y keep the token features computed above
   print("Either use_tokens or use_embeddings needs to be TRUE, using tokens only as default.");
 }
 # Perform matching
 THRES = pipe::binary_entity_resolution_pipeline_lsh(X, Y, num_hashtables, num_hyperplanes, threshold);
 # Columns 1 and 2 of the untabled result are used as encoded ids, column 3 as the score (see below)
 sparse = post::untable(THRES);

 # Write results
 # Decode the recoded ids back to the original ids using the metadata frames from preprocessing
 X_dec = transformdecode(target=sparse[,1], meta=MX_ids[,1], spec="{recode:[C1]}");
 Y_dec = transformdecode(target=sparse[,2], meta=MY_ids[,1], spec="{recode:[C1]}");
 # Output layout: id of first dataset, id of second dataset, similarity score
 output = cbind(cbind(X_dec, Y_dec), as.frame(sparse[,3]));
 write(output, fileOUT, sep=",", sparse=FALSE, format="csv");
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# THIS SCRIPT PERFORMS AN ENTITY RESOLUTION PIPELINE FOR BINARY MATCHING ON TWO FILES
	#
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# FX String --- Location to read the frame of tokens in bow format for the first dataset
	# Each line contains comma separated list of id, token and value
	# FY String --- Location to read the frame of tokens in bow format for the second dataset
	# Each line contains comma separated list of id, token and value
	# OUT String --- Location to save the output of matching pairs
	# Each line contains comma separated ids of one matched pair
	# First column is the id of the first dataset, while the second column is the id of the second one
	# Third column provides the similarity score
	# threshold Double 0.9 Threshold to be considered as a match
	# num_hashtables Int 6 Number of hashtables for LSH blocking.
	# num_hyperplanes Int 4 Number of hyperplanes for LSH blocking.
	# use_tokens Boolean TRUE Whether to use the tokens of FX and FY to generate predictions
	# use_embeddings Boolean FALSE Whether to use the embeddings of XE and YE to generate predictions
	# XE String --- Location to read the frame of embedding matrix for the first dataset
	# Required if use_embeddings is set to TRUE
	# YE String --- Location to read the frame of embedding matrix for the second dataset
	# Required if use_embeddings is set to TRUE
	# ---------------------------------------------------------------------------------------------
	# OUTPUT: frame of matching pairs
	# ---------------------------------------------------------------------------------------------

	source("./scripts/staging/entity-resolution/primitives/postprocessing.dml") as post;
	source("./scripts/staging/entity-resolution/primitives/preprocessing.dml") as pre;
	source("./scripts/staging/entity-resolution/primitives/pipeline.dml") as pipe;

	# Command Line Arguments
	# FX, FY and OUT have no default; every other argument falls back to the value given in ifdef().
	fileFX = $FX;
	fileFY = $FY;
	fileOUT = $OUT;

	threshold = ifdef($threshold, 0.9);
	num_hashtables = ifdef($num_hashtables, 6);
	num_hyperplanes = ifdef($num_hyperplanes, 4);

	use_tokens = ifdef($use_tokens, TRUE);
	use_embeddings = ifdef($use_embeddings, FALSE);
	# Files XE and YE are only required when using embeddings
	fileXE = ifdef($XE, "");
	fileYE = ifdef($YE, "");

	# Read data
	FX = read(fileFX);
	FY = read(fileFY);
	if (use_embeddings) {
	  if (fileXE == "" | fileYE == "") {
	    # Abort here: continuing would leave X_embeddings / Y_embeddings undefined
	    # and the script would fail later with an unrelated error.
	    stop("You need to specify file XE and YE when use_embeddings is set to TRUE");
	  } else {
	    X_embeddings = read(fileXE);
	    Y_embeddings = read(fileYE);
	  }
	}

	# Convert data
	# The token frames are converted unconditionally, because MX_ids and MY_ids
	# are needed further down to decode the ids of the matched pairs.
	[X, Y, M_tokens, MX_ids, MY_ids] = pre::convert_frame_tokens_to_matrix_bow_2(FX,FY);
	if (use_tokens & use_embeddings) {
	  # Use both feature sets side by side: token columns followed by embedding columns
	  X = cbind(X, X_embeddings);
	  Y = cbind(Y, Y_embeddings);
	} else if (use_tokens) {
	  # Nothing to do in this case, since X already contains tokens
	} else if (use_embeddings) {
	  # Embeddings only: replace the token features entirely
	  X = X_embeddings;
	  Y = Y_embeddings;
	} else {
	  # Neither flag set: X and Y keep the token features computed above
	  print("Either use_tokens or use_embeddings needs to be TRUE, using tokens only as default.");
	}
	# Perform matching
	THRES = pipe::binary_entity_resolution_pipeline_lsh(X, Y, num_hashtables, num_hyperplanes, threshold);
	# Columns 1 and 2 of the untabled result are used as encoded ids, column 3 as the score (see below)
	sparse = post::untable(THRES);

	# Write results
	# Decode the recoded ids back to the original ids using the metadata frames from preprocessing
	X_dec = transformdecode(target=sparse[,1], meta=MX_ids[,1], spec="{recode:[C1]}");
	Y_dec = transformdecode(target=sparse[,2], meta=MY_ids[,1], spec="{recode:[C1]}");
	# Output layout: id of first dataset, id of second dataset, similarity score
	output = cbind(cbind(X_dec, Y_dec), as.frame(sparse[,3]));
	write(output, fileOUT, sep=",", sparse=FALSE, format="csv");