// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.impala.infra.tableflattener;

import org.apache.avro.generic.GenericRecord;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.kitesdk.data.CompressionType;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;
import parquet.avro.AvroSchemaConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
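
// Command-line tool that flattens an input data file into flat datasets: parses the
// arguments describing the input file, flattens its schema, and migrates the data into
// datasets under the given output directory.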
public class Main {
  // Command-line options accepted by this tool.
  Options cliOptions_;

  // Describes the location, format, compression and schema of the input data file.
  DatasetDescriptor datasetDescr_;

  // The dir to write the flat datasets to. The dir should either not exist or be
  // empty. The URI can either point to a local dir or an HDFS dir.
  URI outputDir_;

  // The parsed command line.
  CommandLine commandLine_;
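
  // Parses the command-line options and the two positional arguments (input file URI
  // and output directory URI), building the descriptor of the input dataset.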
  @SuppressWarnings("static-access")
  void parseArgs(String[] args) throws ParseException, IOException {
    cliOptions_ = new Options();
    cliOptions_.addOption(OptionBuilder.withLongOpt("help").create("h"));
    cliOptions_.addOption(OptionBuilder
        .hasArg()
        .withLongOpt("input-data-format")
        .withDescription("The format of the input file. Ex, avro")
        .create("f"));
    cliOptions_.addOption(OptionBuilder
        .hasArg()
        .withLongOpt("input-data-compression")
        .withDescription("The compression type of the input file. Ex, snappy")
        .create("c"));
    cliOptions_.addOption(OptionBuilder
        .hasArg()
        .withLongOpt("input-schema-uri")
        .withDescription("The URI of the input file's schema. Ex, file://foo.avsc")
        .create("s"));
    CommandLineParser parser = new PosixParser();
    commandLine_ = parser.parse(cliOptions_, args);
    if (commandLine_.hasOption("h")) printHelp();
    DatasetDescriptor.Builder datasetDescrBuilder = new DatasetDescriptor.Builder();
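    // The two positional arguments are the URIs of the input file and the output dir.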
    String[] dataArgs = commandLine_.getArgs();
    if (dataArgs.length != 2) {
      printHelp("Exactly two arguments are required");
    }
    URI dataFile = URI.create(dataArgs[0]);
    outputDir_ = URI.create(dataArgs[1]);
    datasetDescrBuilder.location(dataFile);
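    // Determine the input format, either from the -f option or from the data file's
    // extension.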
    Format inputFormat;
    if (commandLine_.hasOption("f")) {
      inputFormat = Formats.fromString(commandLine_.getOptionValue("f"));
    } else {
      String dataFilePath = dataFile.getPath();
      if (dataFilePath == null || dataFilePath.isEmpty()) {
        printHelp("Data file URI is missing a path component: " + dataFile.toString());
      }
      String ext = FilenameUtils.getExtension(dataFilePath);
      if (ext.isEmpty()) {
        printHelp("The file format (-f) must be specified");
      }
      inputFormat = Formats.fromString(ext);
    }
    datasetDescrBuilder.format(inputFormat);
    if (commandLine_.hasOption("c")) {
      datasetDescrBuilder.compressionType(
          CompressionType.forName(commandLine_.getOptionValue("c")));
    }
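    // Determine the schema: from the -s option if given, otherwise from the Avro file
    // header or the Parquet file footer.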
    if (commandLine_.hasOption("s")) {
      datasetDescrBuilder.schemaUri(commandLine_.getOptionValue("s"));
    } else if (inputFormat == Formats.AVRO) {
      datasetDescrBuilder.schemaFromAvroDataFile(dataFile);
    } else if (inputFormat == Formats.PARQUET) {
      ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(
          new Configuration(), new org.apache.hadoop.fs.Path(dataFile));
      datasetDescrBuilder.schema(new AvroSchemaConverter().convert(
          parquetMetadata.getFileMetaData().getSchema()));
    } else {
      printHelp("A schema (-s) is required for data format " + inputFormat.getName());
    }
    datasetDescr_ = datasetDescrBuilder.build();
  }
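
  // Prints usage information and exits. A non-empty error message is written to stderr
  // and results in a non-zero exit status.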
  void printHelp() { printHelp(""); }

  void printHelp(String errorMessage) {
    PrintWriter printer = new PrintWriter(
        errorMessage.isEmpty() ? System.out : System.err);
    if (!errorMessage.isEmpty()) printer.println("Error: " + errorMessage + "\n");
    printer.println("Usage: [options] <input uri> <output uri>\n\n" +
        "input uri    The URI to the input file.\n" +
        "             Ex, file:///foo.avro or hdfs://localhost:20500/foo.avro\n" +
        "output uri   The URI to the output directory. The dir must either not\n" +
        "             exist or it must be empty.\n" +
        "             Ex, file:///bar or hdfs://localhost:20500/bar\n\n" +
        "Options:");
    new HelpFormatter().printOptions(printer, 80, cliOptions_, 1, 3);
    printer.close();
    System.exit(errorMessage.isEmpty() ? 0 : 1);
  }
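
  // Flattens the schema of the input file and migrates its data into flat datasets
  // under outputDir_.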
  void exec(String[] args) throws ParseException, IOException {
    Logger.getRootLogger().setLevel(Level.OFF);
    parseArgs(args);
    SchemaFlattener schemaFlattener = new SchemaFlattener(outputDir_);
    FlattenedSchema rootDataset =
        schemaFlattener.flatten(datasetDescr_.getSchema());
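    // Create a throwaway Kite dataset in a temp dir whose descriptor points at the
    // input file, then migrate it into the flattened datasets under outputDir_.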
    Path tempDatasetPath = Files.createTempDirectory(null);
    try {
      Dataset<GenericRecord> srcDataset = Datasets.create(
          "dataset:file:" + tempDatasetPath.toString(), datasetDescr_);
      FileMigrator migrator = new FileMigrator();
      migrator.migrate(srcDataset, rootDataset);
    } finally {
      FileUtils.deleteDirectory(tempDatasetPath.toFile());
    }
  }

  public static void main(String[] args) throws Exception {
    new Main().exec(args);
  }
}