ORC-628. Add tool to count number of rows in ORC files under a directory.
Fixes #506
Signed-off-by: Owen O'Malley <omalley@apache.org>
diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java
index 8b16876..b9057ae 100644
--- a/java/tools/src/java/org/apache/orc/tools/Driver.java
+++ b/java/tools/src/java/org/apache/orc/tools/Driver.java
@@ -86,13 +86,14 @@
" [--define X=Y] <command> <args>");
System.err.println();
System.err.println("Commands:");
- System.err.println(" version - print the version of this ORC tool");
- System.err.println(" meta - print the metadata about the ORC file");
- System.err.println(" data - print the data from the ORC file");
- System.err.println(" scan - scan the ORC file");
System.err.println(" convert - convert CSV and JSON files to ORC");
+ System.err.println(" count - recursively find *.orc and print the number of rows");
+ System.err.println(" data - print the data from the ORC file");
System.err.println(" json-schema - scan JSON files to determine their schema");
System.err.println(" key - print information about the keys");
+ System.err.println(" meta - print the metadata about the ORC file");
+ System.err.println(" scan - scan the ORC file");
+ System.err.println(" version - print the version of this ORC tool");
System.err.println();
System.err.println("To get more help, provide -h to the command");
System.exit(1);
@@ -102,23 +103,34 @@
for(Map.Entry pair: confSettings.entrySet()) {
conf.set(pair.getKey().toString(), pair.getValue().toString());
}
- if ("version".equals(options.command)) {
- PrintVersion.main(conf, options.commandArgs);
- } else if ("meta".equals(options.command)) {
- FileDump.main(conf, options.commandArgs);
- } else if ("data".equals(options.command)) {
- PrintData.main(conf, options.commandArgs);
- } else if ("scan".equals(options.command)) {
- ScanData.main(conf, options.commandArgs);
- } else if ("json-schema".equals(options.command)) {
- JsonSchemaFinder.main(conf, options.commandArgs);
- } else if ("convert".equals(options.command)) {
- ConvertTool.main(conf, options.commandArgs);
- } else if ("key".equals(options.command)) {
- KeyTool.main(conf, options.commandArgs);
- } else {
- System.err.println("Unknown subcommand: " + options.command);
- System.exit(1);
+ switch (options.command) {
+ case "convert":
+ ConvertTool.main(conf, options.commandArgs);
+ break;
+ case "count":
+ RowCount.main(conf, options.commandArgs);
+ break;
+ case "data":
+ PrintData.main(conf, options.commandArgs);
+ break;
+ case "json-schema":
+ JsonSchemaFinder.main(conf, options.commandArgs);
+ break;
+ case "key":
+ KeyTool.main(conf, options.commandArgs);
+ break;
+ case "meta":
+ FileDump.main(conf, options.commandArgs);
+ break;
+ case "scan":
+ ScanData.main(conf, options.commandArgs);
+ break;
+ case "version":
+ PrintVersion.main(conf, options.commandArgs);
+ break;
+ default:
+ System.err.println("Unknown subcommand: " + options.command);
+ System.exit(1);
}
}
}
diff --git a/java/tools/src/java/org/apache/orc/tools/RowCount.java b/java/tools/src/java/org/apache/orc/tools/RowCount.java
new file mode 100644
index 0000000..efe0ffe
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/RowCount.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+
+/**
+ * Given a set of paths, finds all of the "*.orc" files under them and prints
+ * the number of rows in each file.
+ */
+public class RowCount {
+
+  /**
+   * Recursively lists each root in {@code args} and, for every file whose
+   * name ends with ".orc", prints the path and its row count on standard
+   * output. Always terminates the JVM: the exit status is 0 when every
+   * matching file could be read and 1 when at least one of them failed.
+   *
+   * @param conf the configuration used to resolve each file system and to
+   *             build the ORC reader options
+   * @param args the root paths to search
+   * @throws IOException if resolving a file system or listing a root fails;
+   *                     failures on individual ORC files are reported on
+   *                     standard error instead of being thrown
+   */
+  public static void main(Configuration conf, String[] args) throws IOException {
+    int bad = 0;
+    for (String root : args) {
+      Path rootPath = new Path(root);
+      FileSystem fs = rootPath.getFileSystem(conf);
+      RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true);
+      while (itr.hasNext()) {
+        LocatedFileStatus status = itr.next();
+        Path filename = status.getPath();
+        if (!status.isFile() || !filename.getName().endsWith(".orc")) {
+          continue;
+        }
+        try (Reader reader = OrcFile.createReader(filename, OrcFile.readerOptions(conf))) {
+          // Plain concatenation keeps the count in ASCII digits; String.format("%d")
+          // would localize the digits according to the default locale.
+          System.out.println(filename + " " + reader.getNumberOfRows());
+        } catch (IOException ioe) {
+          bad += 1;
+          // Include the cause so the user can tell why the file was rejected.
+          System.err.println("Failed to read " + filename + ": " + ioe);
+        }
+      }
+    }
+    System.exit(bad == 0 ? 0 : 1);
+  }
+
+  /**
+   * Command line entry point; delegates to
+   * {@link #main(Configuration, String[])} with a newly created configuration.
+   *
+   * @param args the root paths to search
+   * @throws IOException if resolving a file system or listing a root fails
+   */
+  public static void main(String[] args) throws IOException {
+    main(new Configuration(), args);
+  }
+}
\ No newline at end of file