// blob: 5bd1fcd69120caf6e22d578e290510a6fc091b73 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import static org.apache.iceberg.util.BinaryUtil.truncateBinary;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMax;
import static org.apache.iceberg.util.BinaryUtil.truncateBinaryMin;
import static org.apache.iceberg.util.UnicodeUtil.truncateStringMax;
import static org.apache.iceberg.util.UnicodeUtil.truncateStringMin;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import java.nio.ByteBuffer;
import java.util.Comparator;
import org.apache.iceberg.expressions.Literal;
import org.junit.jupiter.api.Test;
/**
 * Exercises the truncation helpers statically imported from {@code BinaryUtil} and {@code
 * UnicodeUtil}.
 *
 * <p>Each test compares truncated values against the untruncated input and against hand-written
 * expected values, using the comparator supplied by the corresponding {@link Literal}.
 */
@SuppressWarnings("checkstyle:LocalVariableName")
public class TestMetricsTruncation {

  /** Description shared by every "truncated min must not exceed the input" assertion. */
  private static final String LOWER_BOUND_HOLDS =
      "Truncated lower bound should be lower than or equal to the actual lower bound";

  /** Description shared by every "truncated max must not fall below the input" assertion. */
  private static final String UPPER_BOUND_HOLDS =
      "Truncated upper bound should be greater than or equal to the actual upper bound";

  /**
   * Checks plain binary truncation: zero length, exact length, over-long length, a genuine
   * shortening, that the input buffer's position and remaining count are left untouched, and that
   * a negative length is rejected.
   */
  @Test
  public void testTruncateBinary() {
    ByteBuffer input = bytes(1, 1, 0xFF, 2);
    ByteBuffer empty = ByteBuffer.allocate(0);
    Comparator<ByteBuffer> byteOrder = Literal.of(input).comparator();

    ByteBuffer zeroLength = truncateBinary(input, 0);
    assertThat(byteOrder.compare(zeroLength, empty))
        .as("Truncating to a length of zero should return an empty ByteBuffer")
        .isEqualTo(0);

    ByteBuffer exactLength = truncateBinary(input, input.remaining());
    assertThat(exactLength)
        .as("Truncating to the original buffer's remaining size should return the original buffer")
        .isEqualTo(input);

    ByteBuffer overLength = truncateBinary(input, 16);
    assertThat(overLength)
        .as(
            "Truncating with a length greater than the input's remaining size should return the input")
        .isEqualTo(input);

    ByteBuffer shortened = truncateBinary(input, 2);
    assertThat(shortened.remaining())
        .as(
            "Truncating with a length less than the input's remaining size should truncate properly")
        .isEqualTo(2);
    assertThat(shortened.position())
        .as(
            "Truncating with a length less than the input's remaining size should truncate properly")
        .isEqualTo(0);

    // The source buffer must look exactly as it did before any truncation call.
    assertThat(input.remaining())
        .as("Truncating should not modify the input buffer")
        .isEqualTo(4);
    assertThat(input.position())
        .as("Truncating should not modify the input buffer")
        .isEqualTo(0);

    assertThatThrownBy(() -> truncateBinary(input, -1))
        .isInstanceOf(IllegalArgumentException.class)
        .hasMessage("Truncate length should be non-negative");
  }

  /**
   * Checks that a truncated binary lower bound is a prefix of the input and never compares greater
   * than the input, including when the leading bytes are all 0xFF.
   */
  @Test
  public void testTruncateBinaryMin() {
    ByteBuffer mixed = bytes(1, 1, 0xFF, 2);
    // What mixed is expected to look like once cut down to 2 bytes
    ByteBuffer mixedFirstTwo = bytes(1, 1);
    ByteBuffer maxLeading = bytes(0xFF, 0xFF, 0xFF, 2);
    ByteBuffer maxLeadingFirstTwo = bytes(0xFF, 0xFF);
    Comparator<ByteBuffer> byteOrder = Literal.of(mixed).comparator();

    assertThat(byteOrder.compare(binaryMin(mixed, 2), mixed))
        .as(LOWER_BOUND_HOLDS)
        .isLessThanOrEqualTo(0);
    assertThat(byteOrder.compare(binaryMin(mixed, 2), mixedFirstTwo))
        .as("Output must have the first two bytes of the input")
        .isEqualTo(0);
    assertThat(byteOrder.compare(binaryMin(mixed, 5), mixed))
        .as("No truncation required as truncate length is greater than the input size")
        .isEqualTo(0);

    assertThat(byteOrder.compare(binaryMin(maxLeading, 2), maxLeading))
        .as(LOWER_BOUND_HOLDS)
        .isLessThanOrEqualTo(0);
    assertThat(byteOrder.compare(binaryMin(maxLeading, 2), maxLeadingFirstTwo))
        .as(
            "Output must have the first two bytes of the input. A lower bound exists "
                + "even though the first two bytes are the max value")
        .isEqualTo(0);
  }

  /**
   * Checks that a truncated binary upper bound never compares less than the input, that the last
   * kept byte is incremented (carrying past a 0xFF byte), and that no bound is produced when every
   * kept byte is already 0xFF.
   */
  @Test
  public void testTruncateBinaryMax() {
    ByteBuffer plain = bytes(1, 1, 2);
    ByteBuffer maxThirdByte = bytes(1, 1, 0xFF, 2);
    ByteBuffer maxLeading = bytes(0xFF, 0xFF, 0xFF, 2);
    ByteBuffer zeroTail = bytes(1, 1, 0);
    ByteBuffer incrementedPrefix = bytes(1, 2);
    Comparator<ByteBuffer> byteOrder = Literal.of(plain).comparator();

    assertThat(byteOrder.compare(binaryMax(plain, 2), plain))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(byteOrder.compare(binaryMax(plain, 2), incrementedPrefix))
        .as("Output must have two bytes and the second byte of the input must be incremented")
        .isEqualTo(0);

    assertThat(byteOrder.compare(binaryMax(maxThirdByte, 2), maxThirdByte))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(byteOrder.compare(binaryMax(maxThirdByte, 3), incrementedPrefix))
        .as(
            "Since the third byte is already the max value, output must have two bytes "
                + "with the second byte incremented ")
        .isEqualTo(0);

    assertThat(byteOrder.compare(binaryMax(maxLeading, 5), maxLeading))
        .as("No truncation required as truncate length is greater than the input size")
        .isEqualTo(0);
    // Called directly: the result itself is expected to be null, so there is no value to unwrap.
    assertThat(truncateBinaryMax(Literal.of(maxLeading), 2))
        .as("An upper bound doesn't exist since the first two bytes are the max value")
        .isNull();

    assertThat(byteOrder.compare(binaryMax(zeroTail, 2), zeroTail))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(byteOrder.compare(binaryMax(zeroTail, 2), incrementedPrefix))
        .as(
            "Since a shorter sequence is considered smaller, output must have two bytes "
                + "and the second byte of the input must be incremented")
        .isEqualTo(0);
  }

  /**
   * Checks that a truncated string lower bound is a prefix of the input counted in unicode
   * characters (not chars or bytes) and never compares greater than the input.
   */
  @SuppressWarnings("checkstyle:AvoidEscapedUnicodeCharacters")
  @Test
  public void testTruncateStringMin() {
    String katakana = "イロハニホヘト";
    // What katakana is expected to look like once cut down to 2 and 3 unicode characters
    String katakanaFirstTwo = "イロ";
    String katakanaFirstThree = "イロハ";
    String mixedScripts = "щщаεはчωいにπάほхεろへσκζ";
    String mixedScriptsFirstSeven = "щщаεはчω";
    // U+FFFF is the largest character that still fits in 3 UTF-8 bytes
    String maxThreeByteChars = "\uFFFF\uFFFF";
    // Two characters that each need 4 UTF-8 bytes (one surrogate pair apiece)
    String fourByteChars = "\uD800\uDC00\uD800\uDC00";
    String fourByteCharsFirstOne = "\uD800\uDC00";
    Comparator<CharSequence> textOrder = Literal.of(katakana).comparator();

    assertThat(textOrder.compare(stringMin(katakana, 3), katakana))
        .as(LOWER_BOUND_HOLDS)
        .isLessThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMin(katakana, 8), katakana))
        .as("No truncation required as truncate length is greater than the input size")
        .isEqualTo(0);
    assertThat(textOrder.compare(stringMin(katakana, 2), katakanaFirstTwo))
        .as("Output must have the first two characters of the input")
        .isEqualTo(0);
    assertThat(textOrder.compare(stringMin(katakana, 3), katakanaFirstThree))
        .as("Output must have the first three characters of the input")
        .isEqualTo(0);

    assertThat(textOrder.compare(stringMin(mixedScripts, 16), mixedScripts))
        .as(LOWER_BOUND_HOLDS)
        .isLessThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMin(mixedScripts, 7), mixedScriptsFirstSeven))
        .as("Output must have the first seven characters of the input")
        .isEqualTo(0);

    assertThat(textOrder.compare(stringMin(maxThreeByteChars, 2), maxThreeByteChars))
        .as(LOWER_BOUND_HOLDS)
        .isLessThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMin(maxThreeByteChars, 2), maxThreeByteChars))
        .as("No truncation required as truncate length is equal to the input size")
        .isEqualTo(0);

    assertThat(textOrder.compare(stringMin(fourByteChars, 1), fourByteChars))
        .as(LOWER_BOUND_HOLDS)
        .isLessThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMin(fourByteChars, 1), fourByteCharsFirstOne))
        .as("Output must have the first 4 byte UTF-8 character of the input")
        .isEqualTo(0);
  }

  /**
   * Checks that a truncated string upper bound never compares less than the input, that the last
   * kept unicode character is incremented (including across the 3-byte/4-byte UTF-8 boundary and
   * within surrogate pairs), and that no bound is produced when the kept characters are already
   * the maximum.
   */
  @SuppressWarnings("checkstyle:AvoidEscapedUnicodeCharacters")
  @Test
  public void testTruncateStringMax() {
    String katakana = "イロハニホヘト";
    // What katakana is expected to look like once cut down to 2 and 3 unicode characters
    String katakanaMaxTwo = "イヮ";
    String katakanaMaxThree = "イロバ";
    String mixedScripts = "щщаεはчωいにπάほхεろへσκζ";
    String mixedScriptsMaxSeven = "щщаεはчϊ";
    String combiningMark = "aनि\uFFFF\uFFFF";
    String combiningMarkMaxThree = "aनी";
    // U+FFFF is the largest character that still fits in 3 UTF-8 bytes
    String maxThreeByteChars = "\uFFFF\uFFFF";
    String maxThreeByteCharsMaxOne = "\uD800\uDC00";
    // Two copies of the largest 4 byte UTF-8 character
    String maxFourByteChars = "\uDBFF\uDFFF\uDBFF\uDFFF";
    String maxLowSurrogates = "\uD800\uDFFF\uD800\uDFFF";
    // The first character of maxLowSurrogates, incremented by one
    String maxLowSurrogatesMaxOne = "\uD801\uDC00";
    String emoji = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02";
    String emojiMaxTwo = "\uD83D\uDE02\uD83D\uDE03";
    String emojiMaxOne = "\uD83D\uDE03";
    Comparator<CharSequence> textOrder = Literal.of(katakana).comparator();

    assertThat(textOrder.compare(stringMax(katakana, 4), katakana))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMax(katakana, 7), katakana))
        .as("No truncation required as truncate length is equal to the input size")
        .isEqualTo(0);
    assertThat(textOrder.compare(stringMax(katakana, 2), katakanaMaxTwo))
        .as(
            "Output must have two characters and the second character of the input must "
                + "be incremented")
        .isEqualTo(0);
    assertThat(textOrder.compare(stringMax(katakana, 3), katakanaMaxThree))
        .as(
            "Output must have three characters and the third character of the input must "
                + "be incremented")
        .isEqualTo(0);
    assertThat(textOrder.compare(stringMax(katakana, 8), katakana))
        .as("No truncation required as truncate length is greater than the input size")
        .isEqualTo(0);

    assertThat(textOrder.compare(stringMax(mixedScripts, 8), mixedScripts))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMax(mixedScripts, 7), mixedScriptsMaxSeven))
        .as(
            "Output must have seven characters and the seventh character of the input must be incremented")
        .isEqualTo(0);

    assertThat(textOrder.compare(stringMax(combiningMark, 3), combiningMark))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMax(combiningMark, 3), combiningMarkMaxThree))
        .as(
            "Output must have three characters and the third character of the input must "
                + "be incremented. The second perceivable character in this string is actually a glyph. It consists of "
                + "two unicode characters")
        .isEqualTo(0);

    assertThat(textOrder.compare(stringMax(maxThreeByteChars, 1), maxThreeByteChars))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMax(maxThreeByteChars, 1), maxThreeByteCharsMaxOne))
        .as(
            "Output must have one character. Since the first character is the max 3 byte "
                + "UTF-8 character, it should be incremented to the lowest 4 byte UTF-8 character")
        .isEqualTo(0);

    // Called directly: the result itself is expected to be null, so there is no value to unwrap.
    assertThat(truncateStringMax(Literal.of(maxFourByteChars), 1))
        .as("An upper bound doesn't exist since the first two characters are max UTF-8 characters")
        .isNull();

    assertThat(textOrder.compare(stringMax(maxLowSurrogates, 2), maxLowSurrogates))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMax(maxLowSurrogates, 1), maxLowSurrogatesMaxOne))
        .as(
            "Test 4 byte UTF-8 character increment. Output must have one character with "
                + "the first character incremented")
        .isEqualTo(0);

    assertThat(textOrder.compare(stringMax(emoji, 2), emoji))
        .as(UPPER_BOUND_HOLDS)
        .isGreaterThanOrEqualTo(0);
    assertThat(textOrder.compare(stringMax(emoji, 2), emojiMaxTwo))
        .as(
            "Test input with multiple 4 byte UTF-8 character where the second unicode character should be incremented")
        .isEqualTo(0);
    assertThat(textOrder.compare(stringMax(emoji, 1), emojiMaxOne))
        .as(
            "Test input with multiple 4 byte UTF-8 character where the first unicode character should be incremented")
        .isEqualTo(0);
  }

  /**
   * Builds a buffer wrapping a fresh array that holds the given values, each narrowed to a byte.
   *
   * @param values byte values written as ints so that literals such as {@code 0xFF} need no cast
   * @return a buffer wrapping the new array
   */
  private static ByteBuffer bytes(int... values) {
    byte[] raw = new byte[values.length];
    for (int i = 0; i < values.length; i++) {
      raw[i] = (byte) values[i];
    }
    return ByteBuffer.wrap(raw);
  }

  /** Returns the value of {@code truncateBinaryMin} applied to a literal wrapping {@code input}. */
  private static ByteBuffer binaryMin(ByteBuffer input, int length) {
    return truncateBinaryMin(Literal.of(input), length).value();
  }

  /**
   * Returns the value of {@code truncateBinaryMax} applied to a literal wrapping {@code input}.
   * Only for cases where a bound is expected; this dereferences the result unconditionally.
   */
  private static ByteBuffer binaryMax(ByteBuffer input, int length) {
    return truncateBinaryMax(Literal.of(input), length).value();
  }

  /** Returns the value of {@code truncateStringMin} applied to a literal wrapping {@code input}. */
  private static CharSequence stringMin(String input, int length) {
    return truncateStringMin(Literal.of(input), length).value();
  }

  /**
   * Returns the value of {@code truncateStringMax} applied to a literal wrapping {@code input}.
   * Only for cases where a bound is expected; this dereferences the result unconditionally.
   */
  private static CharSequence stringMax(String input, int length) {
    return truncateStringMax(Literal.of(input), length).value();
  }
}