// blob: 42660d9cdec3dee5e42a54d63f121d538c3f944b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.TestInfo;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.stream.Stream;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Round-trips multi-byte UTF-8 strings through ORC {@code char(n)} and {@code varchar(n)}
 * columns and checks that what is read back matches the input limited to {@code n}
 * Unicode code points (not bytes and not Java {@code char}s).
 */
public class TestUnicode implements TestConf {
  Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
      + File.separator + "tmp"));
  FileSystem fs;
  Path testFilePath;

  /**
   * Supplies the arguments for {@link #testUtf8(String, int, boolean)}: the type names
   * "char" and "varchar", each combined with the maximum lengths 1 through 5.
   * The third argument (hasRTrim) is always {@code true}.
   */
  private static Stream<Arguments> data() {
    ArrayList<Arguments> data = new ArrayList<>();
    for (String type : new String[] {"char", "varchar"}) {
      for (int maxLength = 1; maxLength <= 5; maxLength++) {
        data.add(Arguments.of(type, maxLength, true));
      }
    }
    return data.stream();
  }

  /** Sample inputs written to the ORC file; grouped by the Unicode block they come from. */
  static final String[] utf8strs = new String[] {
      // Character.UnicodeBlock GREEK (2 bytes)
      "\u03b1\u03b2\u03b3", "\u03b1\u03b2", "\u03b1\u03b2\u03b3\u03b4",
      "\u03b1\u03b2\u03b3\u03b4",
      // Character.UnicodeBlock MALAYALAM (3 bytes)
      "\u0d06\u0d30\u0d3e", "\u0d0e\u0d28\u0d4d\u0d24\u0d3e", "\u0d13\u0d7c\u0d15\u0d4d",
      // Unicode emoji (4 bytes); these are encoded as surrogate pairs in Java strings
      "\u270f\ufe0f\ud83d\udcdd\u270f\ufe0f", "\ud83c\udf3b\ud83d\udc1d\ud83c\udf6f",
      "\ud83c\udf7a\ud83e\udd43\ud83c\udf77" };

  /**
   * Opens the local file system and removes any file left over at the path this test
   * method is about to write to.
   */
  @BeforeEach
  public void openFileSystem(TestInfo testInfo) throws Exception {
    fs = FileSystem.getLocal(conf);
    testFilePath = new Path(workDir, "TestOrcFile." +
        testInfo.getTestMethod().get().getName() + ".orc");
    fs.delete(testFilePath, false);
  }

  /**
   * Parameterized entry point: dispatches to {@link #testVarChar(int)} when {@code type}
   * is "varchar" and to {@link #testChar(int, boolean)} for anything else.
   *
   * @param type "char" or "varchar"
   * @param maxLength maximum length of the column
   * @param hasRTrim forwarded to {@link #testChar(int, boolean)}; unused for varchar
   */
  @ParameterizedTest
  @MethodSource("data")
  public void testUtf8(String type, int maxLength, boolean hasRTrim) throws Exception {
    if ("varchar".equals(type)) {
      testVarChar(maxLength);
    } else {
      testChar(maxLength, hasRTrim);
    }
  }

  /**
   * Truncates {@code val} to at most {@code maxLength} code points.
   * Copied from HiveBaseChar.
   *
   * @param val the string to limit; may be null
   * @param maxLength maximum number of code points; values &lt;= 0 disable truncation
   * @return null if {@code val} is null, otherwise {@code val} or a prefix of it
   */
  public static String enforceMaxLength(String val, int maxLength) {
    if (val == null) {
      return null;
    }
    if (maxLength <= 0) {
      return val;
    }
    int codePoints = val.codePointCount(0, val.length());
    if (codePoints <= maxLength) {
      return val;
    }
    // Cut on a code point boundary so that a supplementary character
    // (surrogate pair) is never split in half.
    return val.substring(0, val.offsetByCodePoints(0, maxLength));
  }

  /**
   * Produces the value expected for a fixed-length char column: longer values are cut via
   * {@link #enforceMaxLength(String, int)}, shorter ones are space padded on the right
   * unless {@code rtrim} is set. Copied from HiveBaseChar.
   *
   * @param val the string to adjust; may be null
   * @param maxLength length in code points; a negative value returns {@code val} as is
   * @param rtrim when true, short values are returned without padding
   * @return null if {@code val} is null, otherwise the adjusted string
   */
  public static String getPaddedValue(String val, int maxLength, boolean rtrim) {
    if (val == null) {
      return null;
    }
    if (maxLength < 0) {
      return val;
    }
    int valLength = val.codePointCount(0, val.length());
    if (valLength > maxLength) {
      return enforceMaxLength(val, maxLength);
    }
    if (maxLength > valLength && !rtrim) {
      // Make sure we pad the right amount of spaces; valLength is in terms of code points,
      // while StringUtils.rightPad() is based on the number of java chars.
      int padLength = val.length() + (maxLength - valLength);
      val = StringUtils.rightPad(val, padLength);
    }
    return val;
  }

  /**
   * Writes {@link #utf8strs} to a {@code char(maxLength)} column and verifies the values
   * read back against {@link #getPaddedValue(String, int, boolean)}.
   */
  public void testChar(int maxLength, boolean hasRTrim) throws Exception {
    // char(n)
    TypeDescription schema = TypeDescription.createChar().withMaxLength(maxLength);
    String[] expected = new String[utf8strs.length];
    for (int i = 0; i < utf8strs.length; i++) {
      expected[i] = getPaddedValue(utf8strs[i], maxLength, hasRTrim);
    }
    verifyWrittenStrings(schema, utf8strs, expected, maxLength);
  }

  /**
   * Writes {@link #utf8strs} to a {@code varchar(maxLength)} column and verifies the values
   * read back against {@link #enforceMaxLength(String, int)}.
   */
  public void testVarChar(int maxLength) throws Exception {
    // varchar(n)
    TypeDescription schema = TypeDescription.createVarchar().withMaxLength(maxLength);
    String[] expected = new String[utf8strs.length];
    for (int i = 0; i < utf8strs.length; i++) {
      expected[i] = enforceMaxLength(utf8strs[i], maxLength);
    }
    verifyWrittenStrings(schema, utf8strs, expected, maxLength);
  }

  /**
   * Writes {@code inputs} as a single-column ORC file at {@link #testFilePath}, reads the
   * file back, and asserts that every row equals the matching entry of {@code expected}
   * and that exactly {@code expected.length} rows were read. The file is deleted at the end.
   *
   * @param schema type of the single column to write
   * @param inputs strings to write, encoded as UTF-8
   * @param expected strings expected when reading the file back, in the same order
   * @param maxLength only used in assertion messages
   */
  public void verifyWrittenStrings(TypeDescription schema, String[] inputs, String[] expected,
      int maxLength) throws Exception {
    Writer writer =
        OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)
            .compress(CompressionKind.NONE).bufferSize(10000));
    VectorizedRowBatch batch = schema.createRowBatch();
    BytesColumnVector col = (BytesColumnVector) batch.cols[0];
    for (String input : inputs) {
      // Flush a full batch before adding the next value.
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
      col.setVal(batch.size++, input.getBytes(StandardCharsets.UTF_8));
    }
    writer.addRowBatch(batch);
    writer.close();

    int idx = 0;
    // Close the reader and the record reader even when an assertion fails.
    try (Reader reader =
             OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
         RecordReader rows = reader.rows()) {
      VectorizedRowBatch readBatch = reader.getSchema().createRowBatch();
      BytesColumnVector readCol = (BytesColumnVector) readBatch.cols[0];
      while (rows.nextBatch(readBatch)) {
        for (int r = 0; r < readBatch.size; ++r) {
          assertEquals(expected[idx], toString(readCol, r),
              String.format("test for %s:%d", schema, maxLength));
          idx++;
        }
      }
    }
    // Without this check a file that yields too few (or no) rows would pass silently.
    assertEquals(expected.length, idx,
        String.format("row count for %s:%d", schema, maxLength));
    fs.delete(testFilePath, false);
  }

  /**
   * Decodes one row of a bytes column as a UTF-8 string.
   *
   * @param vector the column to read from
   * @param row the row index; row 0 is used instead when the vector is repeating
   * @return the decoded string, or null when the row is marked null
   */
  static String toString(BytesColumnVector vector, int row) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (!vector.noNulls && vector.isNull[row]) {
      return null;
    }
    return new String(vector.vector[row], vector.start[row], vector.length[row],
        StandardCharsets.UTF_8);
  }
}