#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# ----------------------------------------------------------------------------
# References:
# Fred J. Damerau. 1964.
# A technique for computer detection and correction of spelling errors.
# Commun. ACM 7, 3 (March 1964), 171–176.
# DOI:https://doi.org/10.1145/363958.363994
# ----------------------------------------------------------------------------
# Builtin function that corrects corrupted frames of strings.
# The algorithm operates on the assumption that most strings are correct
# and simply replaces strings that occur rarely with similar strings that
# occur more often.
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME                 TYPE     DEFAULT  MEANING
# ----------------------------------------------------------------------------
# strings              Frame    ---      The nx1 input frame of corrupted strings
# nullMask             Matrix   ---      Mask indicating null/missing entries; must match the dimensions of strings
# frequency_threshold  Double   0.05     Strings that occur above this relative frequency will not be corrected
# distance_threshold   Integer  2        Strings with an edit distance below this threshold are considered similar
# decapitalize         Boolean  TRUE     Decapitalize all strings before correction
# correct              Boolean  TRUE     Correct strings or only report potential errors
# is_verbose           Boolean  FALSE    Print debug information
#
#
# RETURN VALUES
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# Y Frame - Corrected nx1 output frame
# ----------------------------------------------------------------------------
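#
# Example usage (a minimal sketch: the file name, the all-zero mask, and the
# invocation via the builtin name correctTypos are assumptions, not part of
# this script):
#
#   X = read("names.csv", data_type="frame", format="csv");  # nx1 frame of strings
#   mask = matrix(0, rows=nrow(X), cols=ncol(X));             # no missing values
#   Y = correctTypos(X, mask, 0.05, 2, TRUE, TRUE, FALSE);
# ----------------------------------------------------------------------------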
# TODO: future: add parameter for list of words that are sure to be correct
s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double frequency_threshold=0.05, Integer distance_threshold=2,
Boolean decapitalize=TRUE, Boolean correct=TRUE, Boolean is_verbose=FALSE)
return (Frame[String] Y)
{
if(is_verbose)
print ("BEGIN CORRECT-TYPOS SCRIPT");
num_strings = length(strings);
if(is_verbose)
print("num strings: " + num_strings + "\n")
if (decapitalize)
strings = map(strings, "s -> s.toLowerCase()");
if(nrow(strings) != nrow(nullMask) | ncol(strings) != ncol(nullMask))
stop("Dimension mismatch: data dimensions do not match with mask dimensions")
Y = strings
# build dictionary
dict = buildDictionary(strings);
strings = dict[,1];
frequencies = as.matrix(dict[,2]) / num_strings;
lengths = as.matrix(map(strings, "s -> s.length()"));
num_different_strings = nrow(strings);
if (is_verbose) {
print("dict:" )
print(toString(dict));
print("frequencies: ");
print(toString(frequencies));
print("lengths:")
print(toString(lengths))
}
# generate ascii matrix
max_len = max(lengths);
if (is_verbose) {
print("max_len: " + max_len + "\n");
}
# TODO: rewrite this in a less hacky way once proper lambda expressions are supported
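# Each distinct string is encoded as one column of per-character ASCII codes;
# positions beyond a string's length remain 0, so column i holds string i
# zero-padded up to max_len.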
ascii_matrix = matrix(0, rows = max_len, cols = num_different_strings)
parfor (i in 1:num_different_strings) {
for (j in 1:as.scalar(lengths[i, 1])) {
tmp = as.matrix(map(strings[i,], "s -> UtilFunctions.getAsciiAtIdx(s, " + j + ")"));
ascii_matrix[j, i] = tmp[1, 1];
}
}
if (is_verbose) {
print("ascii_matrix: ")
print(toString(ascii_matrix));
}
# create upper triangular matrix with distances
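# 42000 acts as an "infinite" sentinel distance for pairs that can never be
# considered similar: their length difference is already too large, or the
# bounded distance computation bailed out early and returned -1.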
distance_matrix = matrix(0, rows=num_different_strings, cols=num_different_strings);
parfor (i in 1:num_different_strings) {
parfor (j in i:num_different_strings) {
if (i != j) {
if(abs(as.scalar(lengths[i, 1]) - as.scalar(lengths[j , 1])) >= distance_threshold) {
distance_matrix[i, j] = 42000;
} else {
A = ascii_matrix[1:as.scalar(lengths[i,1]), i];
B = ascii_matrix[1:as.scalar(lengths[j,1]), j];
d = damerauLevenshteinDistanceBound(A, B, distance_threshold, FALSE);
distance_matrix[i, j] = ifelse(d == -1, 42000, d);
}
}
}
}
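# Mirror the upper triangle to obtain a symmetric distance matrix and place the
# sentinel on the diagonal so that a string is never chosen as its own
# replacement.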
upper_triangle = upper.tri(target=distance_matrix, values=TRUE);
distance_matrix = distance_matrix + t(upper_triangle) + diag(matrix(42000, num_different_strings, 1));
sorted_frequency_idxs = order(target=frequencies, index.return=TRUE);
if (is_verbose) {
print("DISTANCE MATRIX: ");
print(toString(distance_matrix));
print("sorted frequency idxs: ");
print(toString(sorted_frequency_idxs));
}
# correct strings
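# Process distinct strings from least to most frequent; for each rare string,
# pick the closest string (by Damerau-Levenshtein distance) that is itself
# frequent enough and, if correct is TRUE, replace all its occurrences in Y.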
for (i in 1:num_different_strings) {
idx = as.integer(as.scalar(sorted_frequency_idxs[i])); # lowest frequency idx
frequency = as.scalar(frequencies[idx]);
if (is_verbose) print("idx: " + idx + " - frequency: " + frequency);
if (frequency < frequency_threshold) {
min_idxs = t(order(target=t(distance_matrix[idx,]), index.return=TRUE));
j = 1;
break=FALSE;
while (j <= num_different_strings & !break) {
min_idx = as.integer(as.scalar(min_idxs[,j]));
min = as.integer(as.scalar(distance_matrix[idx, min_idx]));
replacement_frequency = as.scalar(frequencies[min_idx]);
# TODO: additional parameter for replacement_frequency?
if (min < distance_threshold & replacement_frequency > frequency_threshold/2) {
to_replace = as.scalar(strings[idx,]);
replacement = as.scalar(strings[min_idx,]);
if (is_verbose|!correct) print("Replacing " + to_replace + " (" + frequency + ")" + " with " +
replacement + " (" + replacement_frequency + ")" + ", dist: " + min);
if (correct)
Y = replaceStrings(replacement, to_replace, Y);
break=TRUE;
}
j += 1;
}
}
}
if (is_verbose) {
print("Corrected Strings: ");
print(toString(Y));
}
}
replaceStrings = function(String replacement, String to_replace, Frame[String] strings)
return(Frame[String] strings)
{
strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
}
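# Builds a dictionary frame with one row per distinct string: column 1 holds
# the string itself, column 2 its absolute frequency, derived from the recode
# map produced by transformencode.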
buildDictionary = function(Frame[String] S)
return(Frame[Unknown] dict)
{
[ID,M] = transformencode(target=S, spec="{ids:true,recode:[1]}");
dstr = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
dcodes = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[1]");
frequencies = table(seq(1,nrow(dstr)),as.matrix(dcodes)) %*% table(ID, 1);
dict = cbind(dstr, as.frame(frequencies));
}
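# Bounded Damerau-Levenshtein distance (Damerau 1964) between two strings given
# as column vectors of ASCII codes. The computation stops early and returns -1
# as soon as every entry in two consecutive rows of the DP matrix exceeds the
# bound. For example, A = "cat" ([99;97;116]) and B = "hat" ([104;97;116])
# yield a distance of 1 (one substitution).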
damerauLevenshteinDistanceBound = function(Matrix[Double] A, Matrix[Double] B, Double bound, Boolean is_verbose)
return(double dl_distance) {
dl_matrix = matrix(0, rows = length(A) + 1, cols = length(B) + 1);
dl_matrix[length(A) + 1, length(B) + 1] = -1;
dl_matrix[1, 2:(length(B)+1)] = t(seq(2,length(B)+1) - 1);
dl_matrix[2, 1] = 1;
for (j in 2:length(B) + 1) {
cost = as.integer(as.scalar(A[1]) != as.scalar(B[j - 1]))
dl_matrix[2, j] = min(min(
dl_matrix[2, j - 1] + 1,
dl_matrix[1, j] + 1),
dl_matrix[1, j - 1] + cost);
}
i = 2;
break_condition = FALSE;
while (i < length(A) + 1 & !break_condition) {
i += 1;
dl_matrix[i, 1] = i - 1;
cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[1]))
dl_matrix[i, 2] = min(min(
dl_matrix[i - 1, 2] + 1,
dl_matrix[i, 1] + 1),
dl_matrix[i - 1, 1] + cost);
for (j in 3:length(B) + 1) {
cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[j - 1]))
if (as.scalar(A[i - 1]) == as.scalar(B[j - 2]) & as.scalar(A[i - 2]) == as.scalar(B[j - 1])) {
dl_matrix[i, j] = min(min(
dl_matrix[i, j - 1] + 1,
dl_matrix[i - 1, j] + 1), min(
dl_matrix[i - 1, j - 1] + cost,
dl_matrix[i - 2, j - 2] + 1));
} else {
dl_matrix[i, j] = min(min(
dl_matrix[i, j - 1] + 1,
dl_matrix[i - 1, j] + 1),
dl_matrix[i - 1, j - 1] + cost);
}
}
break_condition = min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound;
}
if (is_verbose){
print("dl distance matrix:")
print(toString(dl_matrix));
}
dl_distance = as.scalar(dl_matrix[length(A) + 1, length(B) + 1]);
}