fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 package org.apache.impala.catalog;

 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;

 import org.apache.impala.thrift.THdfsFileFormat;

 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableMap;

 /**
  * Supported HDFS file formats. Every file format specifies:
  * 1) the input format class
  * 2) the output format class
  * 3) the serialization library class
  * 4) whether scanning complex types from it is supported
  * 5) whether the file format can skip complex columns in scans and just materialize
  *    scalar typed columns
  * 6) whether scanning the DATE type from it is supported
  *
  * Important note: Always keep consistent with the classes used in Hive.
  * TODO: Kudu doesn't belong in this list. Either rename this enum or create a separate
  * list of storage engines (see IMPALA-4178).
  */
 public enum HdfsFileFormat {
   RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
       "org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
       "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
       false, true, false),
   TEXT("org.apache.hadoop.mapred.TextInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
       "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
       false, false, true),
   // LZO_TEXT has an empty serialization library. Its input format class is registered
   // in VALID_INPUT_FORMATS as an alias that resolves to TEXT.
   LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
       "", false, false, true),
   SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
       "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
       true, false),
   AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
       "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
       "org.apache.hadoop.hive.serde2.avro.AvroSerDe",
       false, false, true),
   PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
       "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
       "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
       true, true, true),
   ORC("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
       "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
       "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
       true, true, true),
   KUDU("org.apache.hadoop.hive.kudu.KuduInputFormat",
        "org.apache.hadoop.hive.kudu.KuduOutputFormat",
        "org.apache.hadoop.hive.kudu.KuduSerDe",
        false, false, false);

   private final String inputFormat_;
   private final String outputFormat_;
   private final String serializationLib_;

   // Indicates whether we support scanning complex types for this file format.
   private final boolean isComplexTypesSupported_;

   // Indicates whether the file format can skip complex columns in scans and just
   // materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
   // TODO: Remove this once we support complex types for all file formats.
   private final boolean canSkipComplexTypes_;

   // Indicates whether we support scanning DATE type for this file format.
   private final boolean isDateTypeSupported_;

   HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
       boolean isComplexTypesSupported, boolean canSkipComplexTypes,
       boolean isDateTypeSupported) {
     inputFormat_ = inputFormat;
     outputFormat_ = outputFormat;
     serializationLib_ = serializationLib;
     isComplexTypesSupported_ = isComplexTypesSupported;
     canSkipComplexTypes_ = canSkipComplexTypes;
     isDateTypeSupported_ = isDateTypeSupported;
   }

   public String inputFormat() { return inputFormat_; }
   public String outputFormat() { return outputFormat_; }
   public String serializationLib() { return serializationLib_; }

   // Impala supports legacy Parquet input formats and treats them internally as the most
   // modern Parquet input format.
   private static final String[] PARQUET_LEGACY_INPUT_FORMATS = {
       "com.cloudera.impala.hive.serde.ParquetInputFormat",
       "parquet.hive.DeprecatedParquetInputFormat",
       "parquet.hive.MapredParquetInputFormat"
   };

   // Maps every recognized input format class name to the file format it is treated as.
   // Several class names may resolve to the same format (LZO text -> TEXT, legacy
   // Parquet classes -> PARQUET).
   private static final Map<String, HdfsFileFormat> VALID_INPUT_FORMATS =
       ImmutableMap.<String, HdfsFileFormat>builder()
           .put(RC_FILE.inputFormat(), RC_FILE)
           .put(TEXT.inputFormat(), TEXT)
           .put(LZO_TEXT.inputFormat(), TEXT)
           .put(SEQUENCE_FILE.inputFormat(), SEQUENCE_FILE)
           .put(AVRO.inputFormat(), AVRO)
           .put(PARQUET.inputFormat(), PARQUET)
           .put(PARQUET_LEGACY_INPUT_FORMATS[0], PARQUET)
           .put(PARQUET_LEGACY_INPUT_FORMATS[1], PARQUET)
           .put(PARQUET_LEGACY_INPUT_FORMATS[2], PARQUET)
           .put(KUDU.inputFormat(), KUDU)
           .put(ORC.inputFormat(), ORC)
           .build();

   /**
    * Returns true if the string describes an input format class that we support.
    */
   public static boolean isHdfsInputFormatClass(String inputFormatClass) {
     return VALID_INPUT_FORMATS.containsKey(inputFormatClass);
   }

   /**
    * Returns the file format associated with the input format class, or null if
    * the input format class is not supported. 'inputFormatClass' must not be null.
    */
   public static HdfsFileFormat fromHdfsInputFormatClass(String inputFormatClass) {
     Preconditions.checkNotNull(inputFormatClass);
     return VALID_INPUT_FORMATS.get(inputFormatClass);
   }

   /**
    * Returns the corresponding enum for an input format class name (the lookup is done
    * against the supported input format classes, not SerDe classes). If 'className' is
    * not one of our supported input formats, throws an IllegalArgumentException like
    * Enum.valueOf. 'className' must not be null.
    */
   public static HdfsFileFormat fromJavaClassName(String className) {
     Preconditions.checkNotNull(className);
     // The map holds no null values, so a null result means "not a supported class".
     HdfsFileFormat format = VALID_INPUT_FORMATS.get(className);
     if (format == null) throw new IllegalArgumentException(className);
     return format;
   }

   /**
    * Returns the file format corresponding to the given thrift file format. Throws a
    * RuntimeException if the thrift value has no case below.
    */
   public static HdfsFileFormat fromThrift(THdfsFileFormat thriftFormat) {
     switch (thriftFormat) {
       case RC_FILE: return HdfsFileFormat.RC_FILE;
       case TEXT: return HdfsFileFormat.TEXT;
       case SEQUENCE_FILE: return HdfsFileFormat.SEQUENCE_FILE;
       case AVRO: return HdfsFileFormat.AVRO;
       case ORC: return HdfsFileFormat.ORC;
       case PARQUET: return HdfsFileFormat.PARQUET;
       case KUDU: return HdfsFileFormat.KUDU;
       default:
         throw new RuntimeException("Unknown THdfsFileFormat: "
             + thriftFormat + " - should never happen!");
     }
   }

   /**
    * Returns the thrift file format corresponding to this format. Throws a
    * RuntimeException for formats without a case below (currently LZO_TEXT).
    */
   public THdfsFileFormat toThrift() {
     switch (this) {
       case RC_FILE: return THdfsFileFormat.RC_FILE;
       case TEXT: return THdfsFileFormat.TEXT;
       case SEQUENCE_FILE: return THdfsFileFormat.SEQUENCE_FILE;
       case AVRO: return THdfsFileFormat.AVRO;
       case ORC: return THdfsFileFormat.ORC;
       case PARQUET: return THdfsFileFormat.PARQUET;
       case KUDU: return THdfsFileFormat.KUDU;
       default:
         throw new RuntimeException("Unknown HdfsFormat: "
             + this + " - should never happen!");
     }
   }

   /**
    * Returns the SQL text naming this file format. For TEXT combined with LZO or
    * LZO_INDEX compression, the explicit INPUTFORMAT/OUTPUTFORMAT classes of LZO_TEXT
    * are returned instead of a format keyword. Throws a RuntimeException for formats
    * without a case below (currently LZO_TEXT).
    */
   public String toSql(HdfsCompression compressionType) {
     switch (this) {
       case RC_FILE: return "RCFILE";
       case ORC: return "ORC";
       case TEXT:
         if (compressionType == HdfsCompression.LZO ||
             compressionType == HdfsCompression.LZO_INDEX) {
           // TODO: Update this when we can write LZO text.
           // It is not currently possible to create a table with LZO compressed text files
           // in Impala, but this is valid in Hive.
           return String.format("INPUTFORMAT '%s' OUTPUTFORMAT '%s'",
               LZO_TEXT.inputFormat(), LZO_TEXT.outputFormat());
         }
         return "TEXTFILE";
       case SEQUENCE_FILE: return "SEQUENCEFILE";
       case AVRO: return "AVRO";
       case PARQUET: return "PARQUET";
       case KUDU: return "KUDU";
       default:
         throw new RuntimeException("Unknown HdfsFormat: "
             + this + " - should never happen!");
     }
   }

   /**
    * Returns true if this file format with the given compression format is splittable.
    * TEXT is reported as splittable only when the compression is NONE. Throws a
    * RuntimeException for formats without a case below (currently LZO_TEXT).
    */
   public boolean isSplittable(HdfsCompression compression) {
     switch (this) {
       case TEXT:
         return compression == HdfsCompression.NONE;
       case RC_FILE:
       case SEQUENCE_FILE:
       case AVRO:
       case PARQUET:
       case ORC:
         return true;
       case KUDU:
         return false;
       default:
         throw new RuntimeException("Unknown HdfsFormat: "
             + this + " - should never happen!");
     }
   }

   /**
    * Returns true if Impala supports scanning complex-typed columns
    * from a table/partition with this file format.
    */
   public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }

   /**
    * Returns true if this file format can skip complex typed columns and materialize
    * only scalar typed columns.
    */
   public boolean canSkipComplexTypes() { return canSkipComplexTypes_; }

   /**
    * Returns true if Impala supports scanning DATE typed columns from a table/partition of
    * this file format
    */
   public boolean isDateTypeSupported() { return isDateTypeSupported_; }

   /**
    * Returns a list with all formats for which isComplexTypesSupported() is true, in
    * enum declaration order. A new list is created on every call.
    */
   public static List<HdfsFileFormat> complexTypesFormats() {
     List<HdfsFileFormat> result = new ArrayList<>();
     for (HdfsFileFormat f: values()) {
       if (f.isComplexTypesSupported()) result.add(f);
     }
     return result;
   }
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	package org.apache.impala.catalog;

	import java.util.ArrayList;
	import java.util.List;
	import java.util.Map;

	import org.apache.impala.thrift.THdfsFileFormat;

	import com.google.common.base.Preconditions;
	import com.google.common.collect.ImmutableMap;

	/**
	* Supported HDFS file formats. Every file format specifies:
	* 1) the input format class
	* 2) the output format class
	* 3) the serialization library class
	* 4) whether scanning complex types from it is supported
	* 5) whether the file format can skip complex columns in scans and just materialize
	* scalar typed columns
	* 6) Indicates if the given file format supports Date type.
	*
	* Important note: Always keep consistent with the classes used in Hive.
	* TODO: Kudu doesn't belong in this list. Either rename this enum or create a separate
	* list of storage engines (see IMPALA-4178).
	*/
	public enum HdfsFileFormat {
	RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
	"org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
	"org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
	false, true, false),
	TEXT("org.apache.hadoop.mapred.TextInputFormat",
	"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
	"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
	false, false, true),
	LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
	"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
	"", false, false, true),
	SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
	"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
	"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
	true, false),
	AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
	"org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
	"org.apache.hadoop.hive.serde2.avro.AvroSerDe",
	false, false, true),
	PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
	"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
	"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
	true, true, true),
	ORC("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
	"org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
	"org.apache.hadoop.hive.ql.io.orc.OrcSerde",
	true, true, true),
	KUDU("org.apache.hadoop.hive.kudu.KuduInputFormat",
	"org.apache.hadoop.hive.kudu.KuduOutputFormat",
	"org.apache.hadoop.hive.kudu.KuduSerDe",
	false, false, false);

	private final String inputFormat_;
	private final String outputFormat_;
	private final String serializationLib_;

	// Indicates whether we support scanning complex types for this file format.
	private final boolean isComplexTypesSupported_;

	// Indicates whether the file format can skip complex columns in scans and just
	// materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
	// TODO: Remove this once we support complex types for all file formats.
	private final boolean canSkipColumnTypes_;

	// Indicates whether we support scanning DATE type for this file format.
	private final boolean isDateTypeSupported_;

	HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
	boolean isComplexTypesSupported, boolean canSkipColumnTypes,
	boolean isDateTypeSupported) {
	inputFormat_ = inputFormat;
	outputFormat_ = outputFormat;
	serializationLib_ = serializationLib;
	isComplexTypesSupported_ = isComplexTypesSupported;
	canSkipColumnTypes_ = canSkipColumnTypes;
	isDateTypeSupported_ = isDateTypeSupported;
	}

	public String inputFormat() { return inputFormat_; }
	public String outputFormat() { return outputFormat_; }
	public String serializationLib() { return serializationLib_; }

	// Impala supports legacy Parquet input formats and treats them internally as the most
	// modern Parquet input format.
	private static final String[] PARQUET_LEGACY_INPUT_FORMATS = {
	"com.cloudera.impala.hive.serde.ParquetInputFormat",
	"parquet.hive.DeprecatedParquetInputFormat",
	"parquet.hive.MapredParquetInputFormat"
	};

	private static Map<String, HdfsFileFormat> VALID_INPUT_FORMATS =
	ImmutableMap.<String, HdfsFileFormat>builder()
	.put(RC_FILE.inputFormat(), RC_FILE)
	.put(TEXT.inputFormat(), TEXT)
	.put(LZO_TEXT.inputFormat(), TEXT)
	.put(SEQUENCE_FILE.inputFormat(), SEQUENCE_FILE)
	.put(AVRO.inputFormat(), AVRO)
	.put(PARQUET.inputFormat(), PARQUET)
	.put(PARQUET_LEGACY_INPUT_FORMATS[0], PARQUET)
	.put(PARQUET_LEGACY_INPUT_FORMATS[1], PARQUET)
	.put(PARQUET_LEGACY_INPUT_FORMATS[2], PARQUET)
	.put(KUDU.inputFormat(), KUDU)
	.put(ORC.inputFormat(), ORC).build();


	/**
	* Returns true if the string describes an input format class that we support.
	*/
	public static boolean isHdfsInputFormatClass(String inputFormatClass) {
	return VALID_INPUT_FORMATS.containsKey(inputFormatClass);
	}

	/**
	* Returns the file format associated with the input format class, or null if
	* the input format class is not supported.
	*/
	public static HdfsFileFormat fromHdfsInputFormatClass(String inputFormatClass) {
	Preconditions.checkNotNull(inputFormatClass);
	return VALID_INPUT_FORMATS.get(inputFormatClass);
	}

	/**
	* Returns the corresponding enum for a SerDe class name. If classname is not one
	* of our supported formats, throws an IllegalArgumentException like Enum.valueOf
	*/
	public static HdfsFileFormat fromJavaClassName(String className) {
	Preconditions.checkNotNull(className);
	if (isHdfsInputFormatClass(className)) return VALID_INPUT_FORMATS.get(className);
	throw new IllegalArgumentException(className);
	}

	public static HdfsFileFormat fromThrift(THdfsFileFormat thriftFormat) {
	switch (thriftFormat) {
	case RC_FILE: return HdfsFileFormat.RC_FILE;
	case TEXT: return HdfsFileFormat.TEXT;
	case SEQUENCE_FILE: return HdfsFileFormat.SEQUENCE_FILE;
	case AVRO: return HdfsFileFormat.AVRO;
	case ORC: return HdfsFileFormat.ORC;
	case PARQUET: return HdfsFileFormat.PARQUET;
	case KUDU: return HdfsFileFormat.KUDU;
	default:
	throw new RuntimeException("Unknown THdfsFileFormat: "
	+ thriftFormat + " - should never happen!");
	}
	}

	public THdfsFileFormat toThrift() {
	switch (this) {
	case RC_FILE: return THdfsFileFormat.RC_FILE;
	case TEXT: return THdfsFileFormat.TEXT;
	case SEQUENCE_FILE: return THdfsFileFormat.SEQUENCE_FILE;
	case AVRO: return THdfsFileFormat.AVRO;
	case ORC: return THdfsFileFormat.ORC;
	case PARQUET: return THdfsFileFormat.PARQUET;
	case KUDU: return THdfsFileFormat.KUDU;
	default:
	throw new RuntimeException("Unknown HdfsFormat: "
	+ this + " - should never happen!");
	}
	}

	public String toSql(HdfsCompression compressionType) {
	switch (this) {
	case RC_FILE: return "RCFILE";
	case ORC: return "ORC";
	case TEXT:
	if (compressionType == HdfsCompression.LZO \|\|
	compressionType == HdfsCompression.LZO_INDEX) {
	// TODO: Update this when we can write LZO text.
	// It is not currently possible to create a table with LZO compressed text files
	// in Impala, but this is valid in Hive.
	return String.format("INPUTFORMAT '%s' OUTPUTFORMAT '%s'",
	LZO_TEXT.inputFormat(), LZO_TEXT.outputFormat());
	}
	return "TEXTFILE";
	case SEQUENCE_FILE: return "SEQUENCEFILE";
	case AVRO: return "AVRO";
	case PARQUET: return "PARQUET";
	case KUDU: return "KUDU";
	default:
	throw new RuntimeException("Unknown HdfsFormat: "
	+ this + " - should never happen!");
	}
	}

	/**
	* Returns true if this file format with the given compression format is splittable.
	*/
	public boolean isSplittable(HdfsCompression compression) {
	switch (this) {
	case TEXT:
	return compression == HdfsCompression.NONE;
	case RC_FILE:
	case SEQUENCE_FILE:
	case AVRO:
	case PARQUET:
	case ORC:
	return true;
	case KUDU:
	return false;
	default:
	throw new RuntimeException("Unknown HdfsFormat: "
	+ this + " - should never happen!");
	}
	}

	/**
	* Returns true if Impala supports scanning complex-typed columns
	* from a table/partition with this file format.
	*/
	public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }

	/**
	* Returns true if this file format can skip complex typed columns and materialize
	* only scalar typed columns.
	*/
	public boolean canSkipComplexTypes() { return canSkipColumnTypes_; }

	/**
	* Returns true if Impala supports scanning DATE typed columns from a table/partition of
	* this file format
	*/
	public boolean isDateTypeSupported() { return isDateTypeSupported_; }

	/**
	* Returns a list with all formats for which isComplexTypesSupported() is true.
	*/
	public static List<HdfsFileFormat> complexTypesFormats() {
	List<HdfsFileFormat> result = new ArrayList<>();
	for (HdfsFileFormat f: values()) {
	if (f.isComplexTypesSupported()) result.add(f);
	}
	return result;
	}
	}