java/core/src/test/org/apache/orc/TestUnicode.java - orc - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.orc;

 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.TestInfo;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;

 import java.io.File;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.stream.Stream;

 import static org.junit.jupiter.api.Assertions.assertEquals;

 /**
  * Round-trips multi-byte UTF-8 strings through ORC {@code char(n)} and {@code varchar(n)}
  * columns and checks that the values read back match a code-point-based truncation of the
  * inputs.
  *
  * <p>{@code conf} is not declared in this class; it is presumably supplied by
  * {@link TestConf} (confirm against that interface).
  */
 public class TestUnicode implements TestConf {
   /**
    * Scratch directory taken from the {@code test.tmp.dir} system property; when the property
    * is unset it falls back to {@code target}, {@code test}, {@code tmp} joined with the
    * platform file separator.
    */
   Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
       + File.separator + "tmp"));

   /** Local file system, re-acquired before every test invocation. */
   FileSystem fs;
   /** ORC file written and read back by the current test invocation. */
   Path testFilePath;

   /**
    * Supplies the arguments for {@link #testUtf8}: the type name ({@code "char"} first, then
    * {@code "varchar"}), each combined with the maximum lengths 1 through 5.
    *
    * <p>Every tuple passes {@code true} for {@code hasRTrim}, so the padding branch of
    * {@link #getPaddedValue} is not exercised by this test.
    */
   private static Stream<Arguments> data() {
     ArrayList<Arguments> data = new ArrayList<>();
     for (String type : new String[] {"char", "varchar"}) {
       for (int maxLength = 1; maxLength <= 5; maxLength++) {
         data.add(Arguments.of(type, maxLength, true));
       }
     }
     return data.stream();
   }

   /** Sample inputs grouped by how many UTF-8 bytes their code points need. */
   static final String[] utf8strs = new String[] {
       // Character.UnicodeBlock GREEK (2 bytes)
       "\u03b1\u03b2\u03b3", "\u03b1\u03b2", "\u03b1\u03b2\u03b3\u03b4",
       "\u03b1\u03b2\u03b3\u03b4",
       // Character.UnicodeBlock MALAYALAM (3 bytes)
       "\u0d06\u0d30\u0d3e", "\u0d0e\u0d28\u0d4d\u0d24\u0d3e", "\u0d13\u0d7c\u0d15\u0d4d",
       // Unicode emoji: the surrogate pairs are supplementary code points (4 bytes each);
       // U+270F and U+FE0F in the first string are BMP code points (3 bytes each).
       "\u270f\ufe0f\ud83d\udcdd\u270f\ufe0f", "\ud83c\udf3b\ud83d\udc1d\ud83c\udf6f",
       "\ud83c\udf7a\ud83e\udd43\ud83c\udf77" };

   /**
    * Obtains the local file system and derives a per-test-method file path under
    * {@link #workDir}, deleting any file left at that path by an earlier run.
    *
    * @param testInfo JUnit-provided description of the running test; its method name becomes
    *     part of the file name
    */
   @BeforeEach
   public void openFileSystem(TestInfo testInfo) throws Exception {
     fs = FileSystem.getLocal(conf);
     testFilePath = new Path(workDir, "TestOrcFile." +
         testInfo.getTestMethod().get().getName() + ".orc");
     fs.delete(testFilePath, false);
   }

   /**
    * Dispatches one parameter tuple to the varchar or the char round-trip check.
    *
    * @param type {@code "varchar"} selects {@link #testVarChar}; anything else selects
    *     {@link #testChar}
    * @param maxLength declared maximum length of the column
    * @param hasRTrim forwarded to {@link #testChar}; ignored for varchar
    */
   @ParameterizedTest
   @MethodSource("data")
   public void testUtf8(String type, int maxLength, boolean hasRTrim) throws Exception {
     if (type.equals("varchar")) {
       testVarChar(maxLength);
     } else {
       testChar(maxLength, hasRTrim);
     }
   }

   /**
    * Truncates {@code val} to at most {@code maxLength} Unicode code points. Copied from
    * HiveBaseChar.
    *
    * @param val string to truncate; may be {@code null}
    * @param maxLength maximum number of code points; values {@code <= 0} disable truncation
    * @return {@code null} for a {@code null} input, {@code val} itself when no truncation is
    *     needed, otherwise the leading {@code maxLength} code points of {@code val}
    */
   public static String enforceMaxLength(String val, int maxLength) {
     if (val == null) {
       return null;
     }
     if (maxLength <= 0 || val.codePointCount(0, val.length()) <= maxLength) {
       return val;
     }
     // offsetByCodePoints keeps surrogate pairs intact, so a supplementary character is
     // never cut in half.
     return val.substring(0, val.offsetByCodePoints(0, maxLength));
   }

   /**
    * Produces the value expected for a fixed-length char column. Copied from HiveBaseChar.
    *
    * <p>Values longer than {@code maxLength} code points are handed to
    * {@link #enforceMaxLength}. Shorter values are right-padded with spaces up to
    * {@code maxLength} code points only when {@code rtrim} is {@code false}; with
    * {@code rtrim == true} they are returned as-is (nothing is actively trimmed).
    *
    * @param val input string; may be {@code null}
    * @param maxLength declared length in code points; negative values return {@code val}
    *     unchanged
    * @param rtrim when {@code true}, no padding is added
    * @return the truncated, padded or unchanged value, or {@code null} for a {@code null} input
    */
   public static String getPaddedValue(String val, int maxLength, boolean rtrim) {
     if (val == null) {
       return null;
     }
     if (maxLength < 0) {
       return val;
     }

     int valLength = val.codePointCount(0, val.length());
     if (valLength > maxLength) {
       return enforceMaxLength(val, maxLength);
     }

     if (maxLength > valLength && !rtrim) {
       // valLength counts code points while StringUtils.rightPad() counts Java chars, so the
       // target size is the current char length plus the number of missing code points.
       int padLength = val.length() + (maxLength - valLength);
       return StringUtils.rightPad(val, padLength);
     }
     return val;
   }

   /**
    * Writes {@link #utf8strs} to a {@code char(maxLength)} column and verifies the values read
    * back against {@link #getPaddedValue}.
    *
    * @param maxLength declared length of the char column
    * @param hasRTrim passed to {@link #getPaddedValue} as its {@code rtrim} argument
    */
   public void testChar(int maxLength, boolean hasRTrim) throws Exception {
     // char(n)
     TypeDescription schema = TypeDescription.createChar().withMaxLength(maxLength);
     String[] expected = new String[utf8strs.length];
     for (int i = 0; i < utf8strs.length; i++) {
       expected[i] = getPaddedValue(utf8strs[i], maxLength, hasRTrim);
     }
     verifyWrittenStrings(schema, utf8strs, expected, maxLength);
   }

   /**
    * Writes {@link #utf8strs} to a {@code varchar(maxLength)} column and verifies the values
    * read back against {@link #enforceMaxLength}.
    *
    * @param maxLength declared maximum length of the varchar column
    */
   public void testVarChar(int maxLength) throws Exception {
     // varchar(n)
     TypeDescription schema = TypeDescription.createVarchar().withMaxLength(maxLength);
     String[] expected = new String[utf8strs.length];
     for (int i = 0; i < utf8strs.length; i++) {
       expected[i] = enforceMaxLength(utf8strs[i], maxLength);
     }
     verifyWrittenStrings(schema, utf8strs, expected, maxLength);
   }

   /**
    * Writes {@code inputs} as a single-column ORC file, reads the file back and asserts that
    * each row equals the matching entry of {@code expected} and that exactly
    * {@code expected.length} rows were read. The file is deleted when all assertions pass.
    *
    * @param schema type of the single column being written
    * @param inputs strings to write, encoded as UTF-8
    * @param expected values expected on read, in the same order as {@code inputs}
    * @param maxLength declared column length; only used in assertion messages
    */
   public void verifyWrittenStrings(TypeDescription schema, String[] inputs, String[] expected,
       int maxLength) throws Exception {
     // try-with-resources closes the writer even when adding a batch fails.
     try (Writer writer =
              OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)
                  .compress(CompressionKind.NONE).bufferSize(10000))) {
       VectorizedRowBatch batch = schema.createRowBatch();
       BytesColumnVector col = (BytesColumnVector) batch.cols[0];
       for (String input : inputs) {
         if (batch.size == batch.getMaxSize()) {
           // Batch is full: flush it before appending more rows.
           writer.addRowBatch(batch);
           batch.reset();
         }
         col.setVal(batch.size++, input.getBytes(StandardCharsets.UTF_8));
       }
       writer.addRowBatch(batch);
     }

     int idx = 0;
     // Both the reader and its record reader are closed, including when an assertion fails.
     try (Reader reader =
              OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
          RecordReader rows = reader.rows()) {
       VectorizedRowBatch batch = reader.getSchema().createRowBatch();
       BytesColumnVector col = (BytesColumnVector) batch.cols[0];
       while (rows.nextBatch(batch)) {
         for (int r = 0; r < batch.size; ++r) {
           assertEquals(expected[idx], toString(col, r),
               String.format("test for %s:%d", schema, maxLength));
           idx++;
         }
       }
     }
     // Without this check a file that yields too few (or zero) rows would pass silently.
     assertEquals(expected.length, idx,
         String.format("row count for %s:%d", schema, maxLength));
     fs.delete(testFilePath, false);
   }

   /**
    * Decodes one row of a bytes column as a UTF-8 string.
    *
    * @param vector column vector to read from
    * @param row row index; row 0 is read instead when {@code vector.isRepeating} is set
    * @return the decoded string, or {@code null} when the vector may contain nulls and the
    *     selected row is flagged null
    */
   static String toString(BytesColumnVector vector, int row) {
     if (vector.isRepeating) {
       row = 0;
     }
     if (!vector.noNulls && vector.isNull[row]) {
       return null;
     }
     return new String(vector.vector[row], vector.start[row], vector.length[row],
         StandardCharsets.UTF_8);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.orc;

	import org.apache.commons.lang3.StringUtils;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
	import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
	import org.junit.jupiter.api.BeforeEach;
	import org.junit.jupiter.api.TestInfo;
	import org.junit.jupiter.params.ParameterizedTest;
	import org.junit.jupiter.params.provider.Arguments;
	import org.junit.jupiter.params.provider.MethodSource;

	import java.io.File;
	import java.nio.charset.StandardCharsets;
	import java.util.ArrayList;
	import java.util.stream.Stream;

	import static org.junit.jupiter.api.Assertions.assertEquals;

	/**
	 * Round-trips multi-byte UTF-8 strings through ORC {@code char(n)} and {@code varchar(n)}
	 * columns and checks that the values read back match a code-point-based truncation of the
	 * inputs.
	 *
	 * <p>{@code conf} is not declared in this class; it is presumably supplied by
	 * {@link TestConf} (confirm against that interface).
	 */
	public class TestUnicode implements TestConf {
	  /**
	   * Scratch directory taken from the {@code test.tmp.dir} system property; when the property
	   * is unset it falls back to {@code target}, {@code test}, {@code tmp} joined with the
	   * platform file separator.
	   */
	  Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test"
	      + File.separator + "tmp"));

	  /** Local file system, re-acquired before every test invocation. */
	  FileSystem fs;
	  /** ORC file written and read back by the current test invocation. */
	  Path testFilePath;

	  /**
	   * Supplies the arguments for {@link #testUtf8}: the type name ({@code "char"} first, then
	   * {@code "varchar"}), each combined with the maximum lengths 1 through 5.
	   *
	   * <p>Every tuple passes {@code true} for {@code hasRTrim}, so the padding branch of
	   * {@link #getPaddedValue} is not exercised by this test.
	   */
	  private static Stream<Arguments> data() {
	    ArrayList<Arguments> data = new ArrayList<>();
	    for (String type : new String[] {"char", "varchar"}) {
	      for (int maxLength = 1; maxLength <= 5; maxLength++) {
	        data.add(Arguments.of(type, maxLength, true));
	      }
	    }
	    return data.stream();
	  }

	  /** Sample inputs grouped by how many UTF-8 bytes their code points need. */
	  static final String[] utf8strs = new String[] {
	      // Character.UnicodeBlock GREEK (2 bytes)
	      "\u03b1\u03b2\u03b3", "\u03b1\u03b2", "\u03b1\u03b2\u03b3\u03b4",
	      "\u03b1\u03b2\u03b3\u03b4",
	      // Character.UnicodeBlock MALAYALAM (3 bytes)
	      "\u0d06\u0d30\u0d3e", "\u0d0e\u0d28\u0d4d\u0d24\u0d3e", "\u0d13\u0d7c\u0d15\u0d4d",
	      // Unicode emoji: the surrogate pairs are supplementary code points (4 bytes each);
	      // U+270F and U+FE0F in the first string are BMP code points (3 bytes each).
	      "\u270f\ufe0f\ud83d\udcdd\u270f\ufe0f", "\ud83c\udf3b\ud83d\udc1d\ud83c\udf6f",
	      "\ud83c\udf7a\ud83e\udd43\ud83c\udf77" };

	  /**
	   * Obtains the local file system and derives a per-test-method file path under
	   * {@link #workDir}, deleting any file left at that path by an earlier run.
	   *
	   * @param testInfo JUnit-provided description of the running test; its method name becomes
	   *     part of the file name
	   */
	  @BeforeEach
	  public void openFileSystem(TestInfo testInfo) throws Exception {
	    fs = FileSystem.getLocal(conf);
	    testFilePath = new Path(workDir, "TestOrcFile." +
	        testInfo.getTestMethod().get().getName() + ".orc");
	    fs.delete(testFilePath, false);
	  }

	  /**
	   * Dispatches one parameter tuple to the varchar or the char round-trip check.
	   *
	   * @param type {@code "varchar"} selects {@link #testVarChar}; anything else selects
	   *     {@link #testChar}
	   * @param maxLength declared maximum length of the column
	   * @param hasRTrim forwarded to {@link #testChar}; ignored for varchar
	   */
	  @ParameterizedTest
	  @MethodSource("data")
	  public void testUtf8(String type, int maxLength, boolean hasRTrim) throws Exception {
	    if (type.equals("varchar")) {
	      testVarChar(maxLength);
	    } else {
	      testChar(maxLength, hasRTrim);
	    }
	  }

	  /**
	   * Truncates {@code val} to at most {@code maxLength} Unicode code points. Copied from
	   * HiveBaseChar.
	   *
	   * @param val string to truncate; may be {@code null}
	   * @param maxLength maximum number of code points; values {@code <= 0} disable truncation
	   * @return {@code null} for a {@code null} input, {@code val} itself when no truncation is
	   *     needed, otherwise the leading {@code maxLength} code points of {@code val}
	   */
	  public static String enforceMaxLength(String val, int maxLength) {
	    if (val == null) {
	      return null;
	    }
	    if (maxLength <= 0 || val.codePointCount(0, val.length()) <= maxLength) {
	      return val;
	    }
	    // offsetByCodePoints keeps surrogate pairs intact, so a supplementary character is
	    // never cut in half.
	    return val.substring(0, val.offsetByCodePoints(0, maxLength));
	  }

	  /**
	   * Produces the value expected for a fixed-length char column. Copied from HiveBaseChar.
	   *
	   * <p>Values longer than {@code maxLength} code points are handed to
	   * {@link #enforceMaxLength}. Shorter values are right-padded with spaces up to
	   * {@code maxLength} code points only when {@code rtrim} is {@code false}; with
	   * {@code rtrim == true} they are returned as-is (nothing is actively trimmed).
	   *
	   * @param val input string; may be {@code null}
	   * @param maxLength declared length in code points; negative values return {@code val}
	   *     unchanged
	   * @param rtrim when {@code true}, no padding is added
	   * @return the truncated, padded or unchanged value, or {@code null} for a {@code null} input
	   */
	  public static String getPaddedValue(String val, int maxLength, boolean rtrim) {
	    if (val == null) {
	      return null;
	    }
	    if (maxLength < 0) {
	      return val;
	    }

	    int valLength = val.codePointCount(0, val.length());
	    if (valLength > maxLength) {
	      return enforceMaxLength(val, maxLength);
	    }

	    if (maxLength > valLength && !rtrim) {
	      // valLength counts code points while StringUtils.rightPad() counts Java chars, so the
	      // target size is the current char length plus the number of missing code points.
	      int padLength = val.length() + (maxLength - valLength);
	      return StringUtils.rightPad(val, padLength);
	    }
	    return val;
	  }

	  /**
	   * Writes {@link #utf8strs} to a {@code char(maxLength)} column and verifies the values read
	   * back against {@link #getPaddedValue}.
	   *
	   * @param maxLength declared length of the char column
	   * @param hasRTrim passed to {@link #getPaddedValue} as its {@code rtrim} argument
	   */
	  public void testChar(int maxLength, boolean hasRTrim) throws Exception {
	    // char(n)
	    TypeDescription schema = TypeDescription.createChar().withMaxLength(maxLength);
	    String[] expected = new String[utf8strs.length];
	    for (int i = 0; i < utf8strs.length; i++) {
	      expected[i] = getPaddedValue(utf8strs[i], maxLength, hasRTrim);
	    }
	    verifyWrittenStrings(schema, utf8strs, expected, maxLength);
	  }

	  /**
	   * Writes {@link #utf8strs} to a {@code varchar(maxLength)} column and verifies the values
	   * read back against {@link #enforceMaxLength}.
	   *
	   * @param maxLength declared maximum length of the varchar column
	   */
	  public void testVarChar(int maxLength) throws Exception {
	    // varchar(n)
	    TypeDescription schema = TypeDescription.createVarchar().withMaxLength(maxLength);
	    String[] expected = new String[utf8strs.length];
	    for (int i = 0; i < utf8strs.length; i++) {
	      expected[i] = enforceMaxLength(utf8strs[i], maxLength);
	    }
	    verifyWrittenStrings(schema, utf8strs, expected, maxLength);
	  }

	  /**
	   * Writes {@code inputs} as a single-column ORC file, reads the file back and asserts that
	   * each row equals the matching entry of {@code expected} and that exactly
	   * {@code expected.length} rows were read. The file is deleted when all assertions pass.
	   *
	   * @param schema type of the single column being written
	   * @param inputs strings to write, encoded as UTF-8
	   * @param expected values expected on read, in the same order as {@code inputs}
	   * @param maxLength declared column length; only used in assertion messages
	   */
	  public void verifyWrittenStrings(TypeDescription schema, String[] inputs, String[] expected,
	      int maxLength) throws Exception {
	    // try-with-resources closes the writer even when adding a batch fails.
	    try (Writer writer =
	             OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)
	                 .compress(CompressionKind.NONE).bufferSize(10000))) {
	      VectorizedRowBatch batch = schema.createRowBatch();
	      BytesColumnVector col = (BytesColumnVector) batch.cols[0];
	      for (String input : inputs) {
	        if (batch.size == batch.getMaxSize()) {
	          // Batch is full: flush it before appending more rows.
	          writer.addRowBatch(batch);
	          batch.reset();
	        }
	        col.setVal(batch.size++, input.getBytes(StandardCharsets.UTF_8));
	      }
	      writer.addRowBatch(batch);
	    }

	    int idx = 0;
	    // Both the reader and its record reader are closed, including when an assertion fails.
	    try (Reader reader =
	             OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
	         RecordReader rows = reader.rows()) {
	      VectorizedRowBatch batch = reader.getSchema().createRowBatch();
	      BytesColumnVector col = (BytesColumnVector) batch.cols[0];
	      while (rows.nextBatch(batch)) {
	        for (int r = 0; r < batch.size; ++r) {
	          assertEquals(expected[idx], toString(col, r),
	              String.format("test for %s:%d", schema, maxLength));
	          idx++;
	        }
	      }
	    }
	    // Without this check a file that yields too few (or zero) rows would pass silently.
	    assertEquals(expected.length, idx,
	        String.format("row count for %s:%d", schema, maxLength));
	    fs.delete(testFilePath, false);
	  }

	  /**
	   * Decodes one row of a bytes column as a UTF-8 string.
	   *
	   * @param vector column vector to read from
	   * @param row row index; row 0 is read instead when {@code vector.isRepeating} is set
	   * @return the decoded string, or {@code null} when the vector may contain nulls and the
	   *     selected row is flagged null
	   */
	  static String toString(BytesColumnVector vector, int row) {
	    if (vector.isRepeating) {
	      row = 0;
	    }
	    if (!vector.noNulls && vector.isNull[row]) {
	      return null;
	    }
	    return new String(vector.vector[row], vector.start[row], vector.length[row],
	        StandardCharsets.UTF_8);
	  }
	}