scripts/builtin/correctTypos.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # ----------------------------------------------------------------------------
 # References:
 # Fred J. Damerau. 1964.
 #   A technique for computer detection and correction of spelling errors.
 #   Commun. ACM 7, 3 (March 1964), 171–176.
 #   DOI:https://doi.org/10.1145/363958.363994
 # ----------------------------------------------------------------------------

 # Builtin function that corrects corrupted frames of strings
 # This algorithm operates on the assumption that most strings are correct
 # and simply swaps strings that do not occur often with similar strings that
 # occur more often
 #
 # INPUT PARAMETERS:
 # ----------------------------------------------------------------------------
 # NAME                              TYPE      DEFAULT  MEANING
 # ----------------------------------------------------------------------------
 # strings                           String    ---      The nx1 input frame of corrupted strings
 # frequency_threshold               Double    0.05     Strings that occur above this frequency level will not be corrected
 # distance_threshold                integer   2        Max distance at which strings are considered similar
 # decapitalize                      Boolean   TRUE     Decapitalize all strings before correction
 # correct                           Boolean   TRUE     Correct strings or only report potential errors
 # is_verbose                        Boolean   FALSE    Print debug information
 #
 #
 # RETURN VALUES
 # ----------------------------------------------------------------------------
 # NAME     TYPE      DEFAULT  MEANING
 # ----------------------------------------------------------------------------
 # Y        Frame        -     Corrected nx1 output frame
 # ----------------------------------------------------------------------------

 # TODO: future: add parameter for list of words that are sure to be correct

 s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double frequency_threshold=0.05, Integer distance_threshold=2,
     Boolean decapitalize=TRUE, Boolean correct=TRUE, Boolean is_verbose=FALSE)
   return (Frame[String] Y)
 {
   if(is_verbose)
     print ("BEGIN CORRECT-TYPOS SCRIPT");
   num_strings = length(strings);

   if(is_verbose)
     print("num strings: " + num_strings + "\n")

   if (decapitalize)
     strings = map(strings, "s -> s.toLowerCase()");

   if(nrow(strings) != nrow(nullMask) | ncol(strings) != ncol(nullMask))
     stop("Dimension mismatch: data dimensions do not match with mask dimensions")
   Y = strings

   # build dictionary
   dict = buildDictionary(strings);
   strings = dict[,1];
   frequencies = as.matrix(dict[,2]) / num_strings;
   lengths = as.matrix(map(strings, "s -> s.length()"));

   num_different_strings = nrow(strings);
   if (is_verbose) {
     print("dict:" )
     print(toString(dict));
     print("frequencies: ");
     print(toString(frequencies));
     print("lengths:")
     print(toString(lengths))
   }

   # generate ascii matrix
   max_len = max(lengths);
   if (is_verbose) {
     print("max_len: " + max_len + "\n");
   }
   # TODO: when proper lambda expressions are supported: rewrite in not so hacky
   ascii_matrix = matrix(0, rows = max_len, cols = num_different_strings)
   parfor (i in 1:num_different_strings) {
     for (j in 1:as.scalar(lengths[i, 1])) {
       tmp = as.matrix(map(strings[i,], "s -> UtilFunctions.getAsciiAtIdx(s, " + j + ")"));
       ascii_matrix[j, i] = tmp[1, 1];
     }
   }
   if (is_verbose) {
     print("ascii_matrix: ")
     print(toString(ascii_matrix));
   }

   # create upper triangular matrix with distances
   distance_matrix = matrix(0, rows=num_different_strings, cols=num_different_strings);
   parfor (i in 1:num_different_strings) {
     parfor (j in i:num_different_strings) {
       if (i != j) {
         if(abs(as.scalar(lengths[i, 1]) - as.scalar(lengths[j , 1])) >= distance_threshold) {
           distance_matrix[i, j] = 42000;
         } else {
           A = ascii_matrix[1:as.scalar(lengths[i,1]), i];
           B = ascii_matrix[1:as.scalar(lengths[j,1]), j];
           d = damerauLevenshteinDistanceBound(A, B, distance_threshold, FALSE);
           distance_matrix[i, j] = ifelse(d == -1, 42000, d);
         }
       }
     }
   }
   upper_triangle = upper.tri(target=distance_matrix, values=TRUE);
   distance_matrix = distance_matrix + t(upper_triangle) + diag(matrix(42000, num_different_strings, 1));

   sorted_frequency_idxs = order(target=frequencies, index.return=TRUE);
   if (is_verbose) {
     print("DISTANCE MATRIX: ");
     print(toString(distance_matrix));
     print("sorted frequency idxs: ");
     print(toString(sorted_frequency_idxs));
   }

   # correct strings
   for (i in 1:num_different_strings) {
     idx = as.integer(as.scalar(sorted_frequency_idxs[i])); # lowest frequency idx
     frequency = as.scalar(frequencies[idx]);
     if (is_verbose) print("idx: " + idx + " - frequency: " + frequency);
     if (frequency < frequency_threshold) {
       min_idxs = t(order(target=t(distance_matrix[idx,]), index.return=TRUE));

       j = 1;
       break=FALSE;
       while (j <= num_different_strings & !break) {
         min_idx = as.integer(as.scalar(min_idxs[,j]));
         min = as.integer(as.scalar(distance_matrix[idx, min_idx]));
         replacement_frequency = as.scalar(frequencies[min_idx]);

         # TODO: additional parameter for replacement_frequency?
         if (min < distance_threshold & replacement_frequency > frequency_threshold/2) {
           to_replace = as.scalar(strings[idx,]);
           replacement = as.scalar(strings[min_idx,]);
           if (is_verbose|!correct) print("Replacement: " + to_replace + "(" + frequency + ")" + " with " +
             replacement + "(" + replacement_frequency + ")" + " dist: " + min);
           if (correct)
             Y = replaceStrings(replacement, to_replace, Y);
           break=TRUE;
         }
         j += 1;
       }
     }
   }
   if (is_verbose) {
     print("Corrected Strings: ");
     print(toString(Y));
   }
 }

 replaceStrings = function(String replacement, String to_replace, Frame[String] strings)
   return(Frame[String] strings)
 {
   strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
 }

 buildDictionary = function(Frame[String] S)
   return(Frame[Unknown] dict)
 {
   [ID,M] = transformencode(target=S, spec="{ids:true,recode:[1]}");
   dstr = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
   dcodes = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[1]");
   frequencies = table(seq(1,nrow(dstr)),as.matrix(dcodes)) %*% table(ID, 1);
   dict = cbind(dstr, as.frame(frequencies));
 }

 damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, double bound, Boolean is_verbose)
   return(double dl_distance) {

   dl_matrix = matrix(0, rows = length(A) + 1, cols = length(B) + 1);
   dl_matrix[length(A) + 1, length(B) + 1] = -1;
   dl_matrix[1, 2:(length(B)+1)] = t(seq(2,length(B)+1) - 1);
   dl_matrix[2, 1] = 1;

   for (j in 2:length(B) + 1) {
     cost = as.integer(as.scalar(A[1]) != as.scalar(B[j - 1]))
     dl_matrix[2, j] = min(min(
       dl_matrix[2, j - 1] + 1,
       dl_matrix[1, j] + 1),
       dl_matrix[1, j - 1] + cost);
   }

   i = 2;
   break_condition = FALSE;
   while (i < length(A) + 1 & !break_condition) {
     i += 1;

     dl_matrix[i, 1] = i - 1;
     cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[1]))
     dl_matrix[i, 2] = min(min(
       dl_matrix[i - 1, 2] + 1,
       dl_matrix[i, 1] + 1),
       dl_matrix[i - 1, 1] + cost);

     for (j in 3:length(B) + 1) {
       cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[j - 1]))
       if (as.scalar(A[i - 1]) == as.scalar(B[j - 2]) & as.scalar(A[i - 2]) == as.scalar(B[j - 1])) {
         dl_matrix[i, j] = min(min(
           dl_matrix[i, j - 1] + 1,
           dl_matrix[i - 1, j] + 1), min(
           dl_matrix[i - 1, j - 1] + cost,
           dl_matrix[i - 2, j - 2] +  1));
       } else {
         dl_matrix[i, j] = min(min(
           dl_matrix[i, j - 1] + 1,
           dl_matrix[i - 1, j] + 1),
           dl_matrix[i - 1, j - 1] + cost);
       }
     }

     break_condition = min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound;
   }

   if (is_verbose){
     print("dl distance matrix:")
     print(toString(dl_matrix));
   }

   dl_distance = as.scalar(dl_matrix[length(A) + 1, length(B) + 1]);
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# ----------------------------------------------------------------------------
	# References:
	# Fred J. Damerau. 1964.
	# A technique for computer detection and correction of spelling errors.
	# Commun. ACM 7, 3 (March 1964), 171–176.
	# DOI:https://doi.org/10.1145/363958.363994
	# ----------------------------------------------------------------------------

	# Builtin function that corrects corrupted frames of strings
	# This algorithm operates on the assumption that most strings are correct
	# and simply swaps strings that do not occur often with similar strings that
	# occur more often
	#
	# INPUT PARAMETERS:
	# ----------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ----------------------------------------------------------------------------
	# strings String --- The nx1 input frame of corrupted strings
	# frequency_threshold Double 0.05 Strings that occur above this frequency level will not be corrected
	# distance_threshold integer 2 Max distance at which strings are considered similar
	# decapitalize Boolean TRUE Decapitalize all strings before correction
	# correct Boolean TRUE Correct strings or only report potential errors
	# is_verbose Boolean FALSE Print debug information
	#
	#
	# RETURN VALUES
	# ----------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ----------------------------------------------------------------------------
	# Y Frame - Corrected nx1 output frame
	# ----------------------------------------------------------------------------

	# TODO: future: add parameter for list of words that are sure to be correct

	s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double frequency_threshold=0.05, Integer distance_threshold=2,
	Boolean decapitalize=TRUE, Boolean correct=TRUE, Boolean is_verbose=FALSE)
	return (Frame[String] Y)
	{
	if(is_verbose)
	print ("BEGIN CORRECT-TYPOS SCRIPT");
	num_strings = length(strings);

	if(is_verbose)
	print("num strings: " + num_strings + "\n")

	if (decapitalize)
	strings = map(strings, "s -> s.toLowerCase()");

	if(nrow(strings) != nrow(nullMask) \| ncol(strings) != ncol(nullMask))
	stop("Dimension mismatch: data dimensions do not match with mask dimensions")
	Y = strings

	# build dictionary
	dict = buildDictionary(strings);
	strings = dict[,1];
	frequencies = as.matrix(dict[,2]) / num_strings;
	lengths = as.matrix(map(strings, "s -> s.length()"));

	num_different_strings = nrow(strings);
	if (is_verbose) {
	print("dict:" )
	print(toString(dict));
	print("frequencies: ");
	print(toString(frequencies));
	print("lengths:")
	print(toString(lengths))
	}

	# generate ascii matrix
	max_len = max(lengths);
	if (is_verbose) {
	print("max_len: " + max_len + "\n");
	}
	# TODO: when proper lambda expressions are supported: rewrite in not so hacky
	ascii_matrix = matrix(0, rows = max_len, cols = num_different_strings)
	parfor (i in 1:num_different_strings) {
	for (j in 1:as.scalar(lengths[i, 1])) {
	tmp = as.matrix(map(strings[i,], "s -> UtilFunctions.getAsciiAtIdx(s, " + j + ")"));
	ascii_matrix[j, i] = tmp[1, 1];
	}
	}
	if (is_verbose) {
	print("ascii_matrix: ")
	print(toString(ascii_matrix));
	}

	# create upper triangular matrix with distances
	distance_matrix = matrix(0, rows=num_different_strings, cols=num_different_strings);
	parfor (i in 1:num_different_strings) {
	parfor (j in i:num_different_strings) {
	if (i != j) {
	if(abs(as.scalar(lengths[i, 1]) - as.scalar(lengths[j , 1])) >= distance_threshold) {
	distance_matrix[i, j] = 42000;
	} else {
	A = ascii_matrix[1:as.scalar(lengths[i,1]), i];
	B = ascii_matrix[1:as.scalar(lengths[j,1]), j];
	d = damerauLevenshteinDistanceBound(A, B, distance_threshold, FALSE);
	distance_matrix[i, j] = ifelse(d == -1, 42000, d);
	}
	}
	}
	}
	upper_triangle = upper.tri(target=distance_matrix, values=TRUE);
	distance_matrix = distance_matrix + t(upper_triangle) + diag(matrix(42000, num_different_strings, 1));

	sorted_frequency_idxs = order(target=frequencies, index.return=TRUE);
	if (is_verbose) {
	print("DISTANCE MATRIX: ");
	print(toString(distance_matrix));
	print("sorted frequency idxs: ");
	print(toString(sorted_frequency_idxs));
	}

	# correct strings
	for (i in 1:num_different_strings) {
	idx = as.integer(as.scalar(sorted_frequency_idxs[i])); # lowest frequency idx
	frequency = as.scalar(frequencies[idx]);
	if (is_verbose) print("idx: " + idx + " - frequency: " + frequency);
	if (frequency < frequency_threshold) {
	min_idxs = t(order(target=t(distance_matrix[idx,]), index.return=TRUE));

	j = 1;
	break=FALSE;
	while (j <= num_different_strings & !break) {
	min_idx = as.integer(as.scalar(min_idxs[,j]));
	min = as.integer(as.scalar(distance_matrix[idx, min_idx]));
	replacement_frequency = as.scalar(frequencies[min_idx]);

	# TODO: additional parameter for replacement_frequency?
	if (min < distance_threshold & replacement_frequency > frequency_threshold/2) {
	to_replace = as.scalar(strings[idx,]);
	replacement = as.scalar(strings[min_idx,]);
	if (is_verbose\|!correct) print("Replacement: " + to_replace + "(" + frequency + ")" + " with " +
	replacement + "(" + replacement_frequency + ")" + " dist: " + min);
	if (correct)
	Y = replaceStrings(replacement, to_replace, Y);
	break=TRUE;
	}
	j += 1;
	}
	}
	}
	if (is_verbose) {
	print("Corrected Strings: ");
	print(toString(Y));
	}
	}

	replaceStrings = function(String replacement, String to_replace, Frame[String] strings)
	return(Frame[String] strings)
	{
	strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
	}

	buildDictionary = function(Frame[String] S)
	return(Frame[Unknown] dict)
	{
	[ID,M] = transformencode(target=S, spec="{ids:true,recode:[1]}");
	dstr = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
	dcodes = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[1]");
	frequencies = table(seq(1,nrow(dstr)),as.matrix(dcodes)) %*% table(ID, 1);
	dict = cbind(dstr, as.frame(frequencies));
	}

	damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, double bound, Boolean is_verbose)
	return(double dl_distance) {

	dl_matrix = matrix(0, rows = length(A) + 1, cols = length(B) + 1);
	dl_matrix[length(A) + 1, length(B) + 1] = -1;
	dl_matrix[1, 2:(length(B)+1)] = t(seq(2,length(B)+1) - 1);
	dl_matrix[2, 1] = 1;

	for (j in 2:length(B) + 1) {
	cost = as.integer(as.scalar(A[1]) != as.scalar(B[j - 1]))
	dl_matrix[2, j] = min(min(
	dl_matrix[2, j - 1] + 1,
	dl_matrix[1, j] + 1),
	dl_matrix[1, j - 1] + cost);
	}

	i = 2;
	break_condition = FALSE;
	while (i < length(A) + 1 & !break_condition) {
	i += 1;

	dl_matrix[i, 1] = i - 1;
	cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[1]))
	dl_matrix[i, 2] = min(min(
	dl_matrix[i - 1, 2] + 1,
	dl_matrix[i, 1] + 1),
	dl_matrix[i - 1, 1] + cost);

	for (j in 3:length(B) + 1) {
	cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[j - 1]))
	if (as.scalar(A[i - 1]) == as.scalar(B[j - 2]) & as.scalar(A[i - 2]) == as.scalar(B[j - 1])) {
	dl_matrix[i, j] = min(min(
	dl_matrix[i, j - 1] + 1,
	dl_matrix[i - 1, j] + 1), min(
	dl_matrix[i - 1, j - 1] + cost,
	dl_matrix[i - 2, j - 2] + 1));
	} else {
	dl_matrix[i, j] = min(min(
	dl_matrix[i, j - 1] + 1,
	dl_matrix[i - 1, j] + 1),
	dl_matrix[i - 1, j - 1] + cost);
	}
	}

	break_condition = min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound;
	}

	if (is_verbose){
	print("dl distance matrix:")
	print(toString(dl_matrix));
	}

	dl_distance = as.scalar(dl_matrix[length(A) + 1, length(B) + 1]);
	}