/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.internal.column.columnindex;
import static org.apache.parquet.schema.OriginalType.BSON;
import static org.apache.parquet.schema.OriginalType.DECIMAL;
import static org.apache.parquet.schema.OriginalType.ENUM;
import static org.apache.parquet.schema.OriginalType.INTERVAL;
import static org.apache.parquet.schema.OriginalType.JSON;
import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.Random;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveStringifier;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Tests for {@link BinaryTruncator}.
 *
 * <p>The tests come in two flavours: tests with hand-written expected values
 * ({@link #testNonStringTruncate()}, {@link #testStringTruncate()}) and contract tests
 * ({@link #testContractNonStringTypes()}, {@link #testContractStringTypes()}) which verify
 * the ordering, length and UTF-8 validity properties of the truncated values for a fixed set of inputs.
 */
public class TestBinaryTruncator {
  private static final Logger LOG = LoggerFactory.getLogger(TestBinaryTruncator.class);

  // Stringifier of a plain BINARY type; used only to render truncated values in the debug log
  private static final PrimitiveStringifier HEXA_STRINGIFIER = Types.required(BINARY)
      .named("dummy_type").stringifier();

  // Fixed seed so that the randomly chosen truncation lengths are the same in every run
  private static final Random RANDOM = new Random(42);

  // Decoder configured to report (i.e. throw on) any malformed or unmappable input instead of replacing it
  private static final CharsetDecoder UTF8_DECODER = StandardCharsets.UTF_8.newDecoder()
      .onMalformedInput(CodingErrorAction.REPORT)
      .onUnmappableCharacter(CodingErrorAction.REPORT);

  // The maximum values in UTF-8 for the 1, 2, 3 and 4 bytes representations
  private static final String UTF8_1BYTE_MAX_CHAR = "\u007F";
  private static final String UTF8_2BYTES_MAX_CHAR = "\u07FF";
  private static final String UTF8_3BYTES_MAX_CHAR = "\uFFFF";
  private static final String UTF8_4BYTES_MAX_CHAR = "\uDBFF\uDFFF";

  /**
   * A BINARY column annotated as DECIMAL is expected to be returned unchanged by both
   * truncateMin and truncateMax, even if the requested length is shorter than the value.
   */
  @Test
  public void testNonStringTruncate() {
    BinaryTruncator truncator = BinaryTruncator
        .getTruncator(Types.required(BINARY).as(DECIMAL).precision(10).scale(2).named("test_binary_decimal"));
    assertEquals(binary(0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA),
        truncator.truncateMin(binary(0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA), 2));
    assertEquals(binary(0x01, 0x02, 0x03, 0x04, 0x05, 0x06),
        truncator.truncateMax(binary(0x01, 0x02, 0x03, 0x04, 0x05, 0x06), 2));
  }

  /**
   * Runs the (non-strict) contract checks for the types where no actual truncation is required.
   */
  @Test
  public void testContractNonStringTypes() {
    testTruncator(
        Types.required(FIXED_LEN_BYTE_ARRAY).length(8).as(DECIMAL).precision(18).scale(4).named("test_fixed_decimal"),
        false);
    testTruncator(Types.required(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("test_fixed_interval"), false);
    testTruncator(Types.required(BINARY).as(DECIMAL).precision(10).scale(2).named("test_binary_decimal"), false);
    testTruncator(Types.required(INT96).named("test_int96"), false);
  }

  /**
   * Checks the truncation of a UTF8 annotated BINARY column against hand-written expected values.
   */
  @Test
  public void testStringTruncate() {
    BinaryTruncator truncator = BinaryTruncator.getTruncator(Types.required(BINARY).as(UTF8).named("test_utf8"));

    // Truncate 1 byte characters
    assertEquals(Binary.fromString("abc"), truncator.truncateMin(Binary.fromString("abcdef"), 3));
    assertEquals(Binary.fromString("abd"), truncator.truncateMax(Binary.fromString("abcdef"), 3));

    // Truncate 1-2 bytes characters; the target length is "inside" a UTF-8 character
    assertEquals(Binary.fromString("árvízt"), truncator.truncateMin(Binary.fromString("árvíztűrő"), 9));
    assertEquals(Binary.fromString("árvízu"), truncator.truncateMax(Binary.fromString("árvíztűrő"), 9));

    // Truncate highest UTF-8 values -> unable to increment
    assertEquals(
        Binary.fromString(
            UTF8_1BYTE_MAX_CHAR
                + UTF8_2BYTES_MAX_CHAR),
        truncator.truncateMin(Binary.fromString(
            UTF8_1BYTE_MAX_CHAR
                + UTF8_2BYTES_MAX_CHAR
                + UTF8_3BYTES_MAX_CHAR
                + UTF8_4BYTES_MAX_CHAR),
            5));
    assertEquals(
        Binary.fromString(
            UTF8_1BYTE_MAX_CHAR
                + UTF8_2BYTES_MAX_CHAR
                + UTF8_3BYTES_MAX_CHAR
                + UTF8_4BYTES_MAX_CHAR),
        truncator.truncateMax(Binary.fromString(
            UTF8_1BYTE_MAX_CHAR
                + UTF8_2BYTES_MAX_CHAR
                + UTF8_3BYTES_MAX_CHAR
                + UTF8_4BYTES_MAX_CHAR),
            5));

    // Truncate highest UTF-8 values at the end -> increment the first possible character
    assertEquals(
        Binary.fromString(
            UTF8_1BYTE_MAX_CHAR
                + UTF8_2BYTES_MAX_CHAR
                + "b"
                + UTF8_3BYTES_MAX_CHAR),
        truncator.truncateMax(Binary.fromString(
            UTF8_1BYTE_MAX_CHAR
                + UTF8_2BYTES_MAX_CHAR
                + "a"
                + UTF8_3BYTES_MAX_CHAR
                + UTF8_4BYTES_MAX_CHAR),
            10));

    // Truncate invalid UTF-8 values -> truncate without validity check
    assertEquals(binary(0xFF, 0xFE, 0xFD), truncator.truncateMin(binary(0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA), 3));
    assertEquals(binary(0xFF, 0xFE, 0xFE), truncator.truncateMax(binary(0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA), 3));
    assertEquals(binary(0xFF, 0xFE, 0xFE, 0x00, 0x00),
        truncator.truncateMax(binary(0xFF, 0xFE, 0xFD, 0xFF, 0xFF, 0xFF), 5));
  }

  /**
   * Runs the strict contract checks for the types where actual truncation is required.
   */
  @Test
  public void testContractStringTypes() {
    testTruncator(Types.required(BINARY).named("test_binary"), true);
    testTruncator(Types.required(BINARY).as(UTF8).named("test_utf8"), true);
    testTruncator(Types.required(BINARY).as(ENUM).named("test_enum"), true);
    testTruncator(Types.required(BINARY).as(JSON).named("test_json"), true);
    testTruncator(Types.required(BINARY).as(BSON).named("test_bson"), true);
    testTruncator(Types.required(FIXED_LEN_BYTE_ARRAY).length(5).named("test_fixed"), true);
  }

  /**
   * Checks the contract of the truncator of the specified type on a fixed set of values.
   *
   * @param type
   *          the type to get the truncator and the comparator for
   * @param strict
   *          whether actual truncation is required for the regular (non edge case) values
   */
  private void testTruncator(PrimitiveType type, boolean strict) {
    BinaryTruncator truncator = BinaryTruncator.getTruncator(type);
    Comparator<Binary> comparator = type.comparator();

    checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa"), strict, strict);
    checkContract(truncator, comparator, Binary.fromString("árvíztűrő tükörfúrógép"), strict, strict);
    checkContract(truncator, comparator, Binary.fromString("aaaaaaaaaa" + UTF8_3BYTES_MAX_CHAR), strict, strict);
    checkContract(truncator, comparator, Binary.fromString("a" + UTF8_3BYTES_MAX_CHAR + UTF8_1BYTE_MAX_CHAR), strict,
        strict);
    checkContract(truncator, comparator,
        Binary.fromConstantByteArray(new byte[] { (byte) 0xFE, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, (byte) 0xFF }), strict,
        strict);

    // Edge case: zero length -> unable to truncate
    checkContract(truncator, comparator, Binary.fromString(""), false, false);

    // Edge case: containing only UTF-8 max characters -> unable to truncate for max
    checkContract(truncator, comparator, Binary.fromString(
        UTF8_1BYTE_MAX_CHAR +
            UTF8_4BYTES_MAX_CHAR +
            UTF8_3BYTES_MAX_CHAR +
            UTF8_4BYTES_MAX_CHAR +
            UTF8_2BYTES_MAX_CHAR +
            UTF8_3BYTES_MAX_CHAR +
            UTF8_3BYTES_MAX_CHAR +
            UTF8_1BYTE_MAX_CHAR +
            UTF8_2BYTES_MAX_CHAR +
            UTF8_3BYTES_MAX_CHAR +
            UTF8_4BYTES_MAX_CHAR),
        strict, false);

    // Edge case: non-UTF-8; max bytes -> unable to truncate for max
    checkContract(
        truncator, comparator,
        binary(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF),
        strict, false);
  }

  /**
   * Checks the contract of the truncator for one value using several target lengths.
   *
   * @param truncator
   *          the truncator under test
   * @param comparator
   *          the comparator defining the ordering the truncated values are checked against
   * @param value
   *          the value to be truncated
   * @param strictMin
   *          strict flag for the truncateMin checks; strict means actual truncation is required
   * @param strictMax
   *          strict flag for the truncateMax checks; strict means actual truncation is required
   */
  private void checkContract(BinaryTruncator truncator, Comparator<Binary> comparator, Binary value, boolean strictMin,
      boolean strictMax) {
    int length = value.length();

    // Edge cases: returning the original value if no truncation is required
    assertSame(value, truncator.truncateMin(value, length));
    assertSame(value, truncator.truncateMax(value, length));
    assertSame(value, truncator.truncateMin(value, random(length + 1, length * 2 + 1)));
    assertSame(value, truncator.truncateMax(value, random(length + 1, length * 2 + 1)));

    // Target lengths shorter than the value: one byte shorter and a random length in [1, length - 1]
    if (length > 1) {
      checkMinContract(truncator, comparator, value, length - 1, strictMin);
      checkMaxContract(truncator, comparator, value, length - 1, strictMax);
      checkMinContract(truncator, comparator, value, random(1, length - 1), strictMin);
      checkMaxContract(truncator, comparator, value, random(1, length - 1), strictMax);
    }

    // Edge case: possible to truncate min value to 0 length if original value is not empty
    checkMinContract(truncator, comparator, value, 0, strictMin);
    // Edge case: impossible to truncate max value to 0 length -> returning the original value
    assertSame(value, truncator.truncateMax(value, 0));
  }

  /**
   * Truncates the value by truncateMin and checks that the result is not greater than the original value
   * and fulfills the common requirements checked by {@link #checkTruncatedValue}.
   */
  private void checkMinContract(BinaryTruncator truncator, Comparator<Binary> comparator, Binary value, int length,
      boolean strict) {
    Binary truncated = truncator.truncateMin(value, length);
    LOG.debug("\"{}\" --truncMin({})--> \"{}\" [{}]", value.toStringUsingUTF8(), length, truncated.toStringUsingUTF8(),
        HEXA_STRINGIFIER.stringify(truncated));
    assertTrue("truncatedMin(value) should be <= than value", comparator.compare(truncated, value) <= 0);
    checkTruncatedValue("truncateMin", value, truncated, strict);
  }

  /**
   * Truncates the value by truncateMax and checks that the result is not less than the original value
   * and fulfills the common requirements checked by {@link #checkTruncatedValue}.
   */
  private void checkMaxContract(BinaryTruncator truncator, Comparator<Binary> comparator, Binary value, int length,
      boolean strict) {
    Binary truncated = truncator.truncateMax(value, length);
    LOG.debug("\"{}\" --truncMax({})--> \"{}\" [{}]", value.toStringUsingUTF8(), length, truncated.toStringUsingUTF8(),
        HEXA_STRINGIFIER.stringify(truncated));
    assertTrue("truncatedMax(value) should be >= than value", comparator.compare(truncated, value) >= 0);
    checkTruncatedValue("truncateMax", value, truncated, strict);
  }

  /**
   * Checks the requirements that are common for the min and the max truncation: the truncated value is never
   * longer than the original one, it is a valid UTF-8 string if the original one is, and - in strict mode - it is
   * shorter than the original one.
   *
   * @param operation
   *          the name of the truncate method; used in the assertion messages only
   * @param value
   *          the original value
   * @param truncated
   *          the value returned by the truncator
   * @param strict
   *          whether the truncated value is required to be shorter than the original one
   */
  private static void checkTruncatedValue(String operation, Binary value, Binary truncated, boolean strict) {
    assertFalse("length of " + operation + "(value) should not be > than the length of value",
        truncated.length() > value.length());
    if (isValidUtf8(value)) {
      checkValidUtf8(truncated);
    }
    if (strict) {
      assertTrue("length of " + operation + "(value) should be < than the length of value",
          truncated.length() < value.length());
    }
  }

  /**
   * Returns whether the bytes of the specified value can be decoded by the reporting UTF-8 decoder.
   */
  private static boolean isValidUtf8(Binary binary) {
    try {
      UTF8_DECODER.decode(binary.toByteBuffer());
      return true;
    } catch (CharacterCodingException e) {
      return false;
    }
  }

  /**
   * Fails with an {@link AssertionError} (keeping the decoding error as its cause) if the bytes of the specified
   * value cannot be decoded by the reporting UTF-8 decoder.
   */
  private static void checkValidUtf8(Binary binary) {
    try {
      UTF8_DECODER.decode(binary.toByteBuffer());
    } catch (CharacterCodingException e) {
      throw new AssertionError("Truncated value should be a valid UTF-8 string", e);
    }
  }

  /**
   * Returns a pseudo-random integer between min and max (both inclusive) from the fixed-seed generator.
   */
  private static int random(int min, int max) {
    return RANDOM.nextInt(max - min + 1) + min;
  }

  /**
   * Creates a {@link Binary} from byte values written as unsigned integers (0x00 - 0xFF).
   *
   * @param unsignedBytes
   *          the byte values; each one has to fit in the lowest 8 bits
   * @return the binary backed by the converted byte array
   * @throws IllegalArgumentException
   *           if any of the values has a bit set above the lowest 8 bits
   */
  private static Binary binary(int... unsignedBytes) {
    byte[] bytes = new byte[unsignedBytes.length];
    for (int i = 0; i < bytes.length; ++i) {
      int b = unsignedBytes[i];
      // Explicit check instead of the assert keyword so the validation does not depend on the -ea JVM flag
      if ((b & 0xFFFFFF00) != 0) {
        throw new IllegalArgumentException("Not an unsigned byte value at index " + i + ": " + b);
      }
      bytes[i] = (byte) b;
    }
    return Binary.fromConstantByteArray(bytes);
  }
}