blob: addc75940eab38d3da0e95c1fa92d14acc05dbf6 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Corrects corrupted frames of strings
# This algorithm operates on the assumption that most strings are correct
# and simply swaps strings that do not occur often with similar strings that
# occur more often
#
# .. code-block::
#
# References:
# Fred J. Damerau. 1964.
# A technique for computer detection and correction of spelling errors.
# Commun. ACM 7, 3 (March 1964), 171–176.
# DOI:https://doi.org/10.1145/363958.363994
#
# TODO: future: add parameter for list of words that are sure to be correct
#
# INPUT:
# ----------------------------------------------------------------------------------------
# strings The nx1 input frame of corrupted strings
# nullMask ---
# frequency_threshold Strings that occur above this frequency level will not be corrected
# distance_threshold Max distance at which strings are considered similar
# distance matrix ---
# dict ---
# ----------------------------------------------------------------------------------------
#
# OUTPUT:
# ---------------------------------------------------------------------------------------------
# Y Corrected nx1 output frame
# ---------------------------------------------------------------------------------------------
s_correctTyposApply = function(Frame[String] strings, Double frequency_threshold = 0.05, Integer distance_threshold = 2, Matrix[Double] distance_matrix, Frame[Unknown] dict)
return (Frame[String] Y)
{
strings = map(strings, "s -> s.toLowerCase()");
Y = strings
frequencies = as.matrix(dict[,2]) / length(strings);
strings = dict[,1];
num_different_strings = nrow(strings)
sorted_frequency_idxs = order(target=frequencies, index.return=TRUE);
# correct strings
for (i in 1:num_different_strings) {
idx = as.integer(as.scalar(sorted_frequency_idxs[i])); # lowest frequency idx
frequency = as.scalar(frequencies[idx]);
if (frequency < frequency_threshold) {
min_idxs = t(order(target=t(distance_matrix[idx,]), index.return=TRUE));
j = 1;
break=FALSE;
while (j <= num_different_strings & !break) {
min_idx = as.integer(as.scalar(min_idxs[,j]));
min = as.integer(as.scalar(distance_matrix[idx, min_idx]));
replacement_frequency = as.scalar(frequencies[min_idx]);
# TODO: additional parameter for replacement_frequency?
if (min < distance_threshold & replacement_frequency > frequency_threshold/2) {
to_replace = as.scalar(strings[idx,]);
replacement = as.scalar(strings[min_idx,]);
Y = replaceStrings1(replacement, to_replace, Y);
break=TRUE;
}
j += 1;
}
}
}
}
replaceStrings1 = function(String replacement, String to_replace, Frame[String] strings)
return(Frame[String] strings)
{
strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
}