StringComparators: No need to convert to UTF-8 for lexicographic comparison. (#11171)
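The lexicographic orderings of UTF-8 byte sequences and of in-memory UTF-16
strings are equivalent, so we can skip the (expensive) conversion and
get an equivalent result. Thank you, Unicode!
Caveat: this holds only when no supplementary (non-BMP) characters are
involved. String.compareTo compares UTF-16 code units, so surrogate pairs
(0xD800-0xDFFF) sort before U+E000-U+FFFF, whereas in UTF-8 byte order
supplementary characters sort after them.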
diff --git a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
index ee2e779..58e228a 100644
--- a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
+++ b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java
@@ -21,9 +21,7 @@
import com.google.common.collect.Ordering;
import com.google.common.primitives.Ints;
-import com.google.common.primitives.UnsignedBytes;
import org.apache.druid.common.guava.GuavaUtils;
-import org.apache.druid.java.util.common.StringUtils;
import org.apache.maven.artifact.versioning.DefaultArtifactVersion;
import java.math.BigDecimal;
@@ -51,20 +49,13 @@
public static class LexicographicComparator extends StringComparator
{
- private static final Ordering<String> ORDERING = Ordering.from(new Comparator<String>()
- {
- @Override
- public int compare(String s, String s2)
- {
- return UnsignedBytes.lexicographicalComparator().compare(
- StringUtils.toUtf8(s), StringUtils.toUtf8(s2));
- }
- }).nullsFirst();
-
+ // Same as unsigned UTF-8 byte order, except surrogate pairs sort before U+E000-U+FFFF (UTF-16 code-unit order).
+ private static final Ordering<String> ORDERING = Ordering.from(String::compareTo).nullsFirst();
+
@Override
public int compare(String s, String s2)
{
- // Avoid conversion to bytes for equal references
+ // Avoid comparisons for equal references
// Assuming we mostly compare different strings, checking s.equals(s2) will only make the comparison slower.
//noinspection StringEquality
if (s == s2) {