ORC-1643: Add test for `scan` command ### What changes were proposed in this pull request? Add unit test for `scan` command of `orc-tools`. ### Why are the changes needed? Improve code test coverage and avoid regressions. ### How was this patch tested? Add UT ### Was this patch authored or co-authored using generative AI tooling? No Closes #1833 from cxzl25/ORC-1643. Authored-by: sychen <sychen@ctrip.com> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>

commit: 01ebb961ba30f25efd33777a5220225feedc45c2 [log] [tgz]
author: sychen <sychen@ctrip.com> Mon Mar 04 11:28:27 2024 -0800
committer: Dongjoon Hyun <dongjoon@apache.org> Mon Mar 04 11:28:27 2024 -0800
tree: 64a7d26075413b07b54d017ac6bbdc1c881c6b17
parent: 6c24acdbf232b5323b7138d93b51806f3fa9fb01 [diff]
diff --git a/java/tools/src/test/org/apache/orc/tools/TesScanData.java b/java/tools/src/test/org/apache/orc/tools/TesScanData.java
new file mode 100644
index 0000000..df73abc
--- /dev/null
+++ b/java/tools/src/test/org/apache/orc/tools/TesScanData.java

@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TesScanData {
+  /** Number of rows written to the test file and expected in the scan summary. */
+  private static final int ROW_COUNT = 10000;
+
+  private Path workDir = new Path(System.getProperty("test.tmp.dir"));
+  private Configuration conf;
+  private FileSystem fs;
+  private Path testFilePath;
+
+  /**
+   * Opens the local file system rooted at the test working directory and deletes
+   * the test file path (non-recursively) before each test.
+   */
+  @BeforeEach
+  public void openFileSystem() throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    fs.setWorkingDirectory(workDir);
+    testFilePath = new Path("TesScanData.testScan.orc");
+    fs.delete(testFilePath, false);
+  }
+
+  /**
+   * Writes a two-column ORC file, runs the scan command on it with {@code --schema},
+   * and checks that the printed schema and the per-file row summary are present.
+   */
+  @Test
+  public void testScan() throws Exception {
+    writeTestFile();
+    String output = runScan("--schema", testFilePath.toString());
+    assertTrue(output.contains("{\"category\": \"struct\", \"id\": 0, \"max\": 2, \"fields\": [\n" +
+        "{  \"x\": {\"category\": \"int\", \"id\": 1, \"max\": 1}},\n" +
+        "{  \"y\": {\"category\": \"string\", \"id\": 2, \"max\": 2}}]}"),
+        "schema not found in scan output: " + output);
+    assertTrue(output.contains("File: TesScanData.testScan.orc, bad batches: 0, rows: "
+        + ROW_COUNT + "/" + ROW_COUNT),
+        "row summary not found in scan output: " + output);
+  }
+
+  /**
+   * Writes {@code ROW_COUNT} rows to {@code testFilePath}, where row {@code r} holds
+   * {@code x = r} and {@code y = "byte-" + r}.
+   */
+  private void writeTestFile() throws Exception {
+    TypeDescription schema = TypeDescription.fromString("struct<x:int,y:string>");
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    LongColumnVector x = (LongColumnVector) batch.cols[0];
+    BytesColumnVector y = (BytesColumnVector) batch.cols[1];
+    for (int r = 0; r < ROW_COUNT; ++r) {
+      int row = batch.size++;
+      x.vector[row] = r;
+      // Encode explicitly as UTF-8 rather than with the platform default charset.
+      byte[] buffer = ("byte-" + r).getBytes(StandardCharsets.UTF_8);
+      y.setRef(row, buffer, 0, buffer.length);
+      // Flush each full batch so the next rows start from an empty one.
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    // Write whatever is left in the final, partially filled batch.
+    if (batch.size != 0) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+  }
+
+  /**
+   * Runs {@code ScanData.main} with the given arguments while {@code System.out} is
+   * redirected to an in-memory buffer, and returns the captured output as UTF-8 text.
+   */
+  private String runScan(String... args) throws Exception {
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    PrintStream capture = new PrintStream(myOut, false, StandardCharsets.UTF_8);
+    System.setOut(capture);
+    try {
+      ScanData.main(conf, args);
+    } finally {
+      // Restore the real stdout even when the command throws, so a failure here
+      // cannot swallow the output of tests that run afterwards.
+      System.setOut(origOut);
+      capture.flush();
+    }
+    return myOut.toString(StandardCharsets.UTF_8);
+  }
+}
commit	01ebb961ba30f25efd33777a5220225feedc45c2	[log] [tgz]
author	sychen <sychen@ctrip.com>	Mon Mar 04 11:28:27 2024 -0800
committer	Dongjoon Hyun <dongjoon@apache.org>	Mon Mar 04 11:28:27 2024 -0800
tree	64a7d26075413b07b54d017ac6bbdc1c881c6b17
parent	6c24acdbf232b5323b7138d93b51806f3fa9fb01 [diff]