blob: ac95d40c59db7ef68c37d8f06c39b7fb67ee9e87 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.util;
import static org.apache.beam.vendor.guava.v20_0.com.google.common.base.Preconditions.checkNotNull;
import java.util.ArrayList;
import java.util.List;
/** Utilities for working with JSON and other human-readable string formats. */
public class StringUtils {
/**
* Converts the given array of bytes into a legal JSON string.
*
* <p>Uses a simple strategy of converting each byte to a single char, except for non-printable
* chars, non-ASCII chars, and '%', '\', and '"', which are encoded as three chars in '%xx'
* format, where 'xx' is the hexadecimal encoding of the byte.
*/
public static String byteArrayToJsonString(byte[] bytes) {
StringBuilder sb = new StringBuilder(bytes.length * 2);
for (byte b : bytes) {
if (b >= 32 && b < 127) {
// A printable ascii character.
char c = (char) b;
if (c != '%' && c != '\\' && c != '\"') {
// Not an escape prefix or special character, either.
// Send through unchanged.
sb.append(c);
continue;
}
}
// Send through escaped. Use '%xx' format.
sb.append(String.format("%%%02x", b));
}
return sb.toString();
}
/**
* Converts the given string, encoded using {@link #byteArrayToJsonString}, into a byte array.
*
* @throws IllegalArgumentException if the argument string is not legal
*/
public static byte[] jsonStringToByteArray(String string) {
List<Byte> bytes = new ArrayList<>();
for (int i = 0; i < string.length(); ) {
char c = string.charAt(i);
Byte b;
if (c == '%') {
// Escaped. Expect '%xx' format.
try {
b = (byte) Integer.parseInt(string.substring(i + 1, i + 3), 16);
} catch (IndexOutOfBoundsException | NumberFormatException exn) {
throw new IllegalArgumentException(
"not in legal encoded format; "
+ "substring ["
+ i
+ ".."
+ (i + 2)
+ "] not in format \"%xx\"",
exn);
}
i += 3;
} else {
// Send through unchanged.
b = (byte) c;
i++;
}
bytes.add(b);
}
byte[] byteArray = new byte[bytes.size()];
int i = 0;
for (Byte b : bytes) {
byteArray[i++] = b;
}
return byteArray;
}
/**
* Calculate the Levenshtein distance between two strings.
*
* <p>The Levenshtein distance between two words is the minimum number of single-character edits
* (i.e. insertions, deletions or substitutions) required to change one string into the other.
*/
public static int getLevenshteinDistance(final String s, final String t) {
checkNotNull(s);
checkNotNull(t);
// base cases
if (s.equals(t)) {
return 0;
}
if (s.length() == 0) {
return t.length();
}
if (t.length() == 0) {
return s.length();
}
// create two work arrays to store integer distances
final int[] v0 = new int[t.length() + 1];
final int[] v1 = new int[t.length() + 1];
// initialize v0 (the previous row of distances)
// this row is A[0][i]: edit distance for an empty s
// the distance is just the number of characters to delete from t
for (int i = 0; i < v0.length; i++) {
v0[i] = i;
}
for (int i = 0; i < s.length(); i++) {
// calculate v1 (current row distances) from the previous row v0
// first element of v1 is A[i+1][0]
// edit distance is delete (i+1) chars from s to match empty t
v1[0] = i + 1;
// use formula to fill in the rest of the row
for (int j = 0; j < t.length(); j++) {
int cost = (s.charAt(i) == t.charAt(j)) ? 0 : 1;
v1[j + 1] = Math.min(Math.min(v1[j] + 1, v0[j + 1] + 1), v0[j] + cost);
}
// copy v1 (current row) to v0 (previous row) for next iteration
System.arraycopy(v1, 0, v0, 0, v0.length);
}
return v1[t.length()];
}
}