// blob: 9515ff43494b09d7fd73d2b2d43a63ef4fc9e8d7 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.impala.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.impala.catalog.MapType;
import org.apache.impala.catalog.ScalarType;
import org.apache.impala.catalog.StructField;
import org.apache.impala.catalog.ArrayType;
import org.apache.impala.catalog.StructType;
import org.apache.impala.catalog.Type;
import org.apache.impala.common.AnalysisException;
import org.apache.impala.common.FileSystemUtil;
import org.apache.impala.util.FileAnalysisUtil;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcFile.ReaderOptions;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.TypeDescription.Category;
import com.google.common.base.Preconditions;
/**
 * Provides a helper function (extract()) which extracts the Impala schema from a given
 * ORC file. Details of the ORC types:
 * https://orc.apache.org/docs/types.html
 */
public class OrcSchemaExtractor {
  // Template for conversion failures. The arguments are, in order: the ORC type, the
  // name of the targeted Impala type, and the reason for the failure.
  private static final String ERROR_MSG =
      "Failed to convert ORC type\n%s\nto an Impala %s type:\n%s\n";

  /**
   * Validates the path and loads the ORC schema of the file. The ORC schema is also an
   * ORC type (TypeDescription), represented as a struct.
   *
   * Throws an AnalysisException if the path fails validation or if the file cannot be
   * opened as an ORC file. In the latter case the underlying IOException is attached
   * as the cause so that the full stack trace stays available to the caller.
   */
  private static TypeDescription loadOrcSchema(Path pathToFile) throws AnalysisException {
    FileAnalysisUtil.CheckIfFile(pathToFile);
    try {
      // NOTE(review): the Reader is never closed. Whether Reader is Closeable depends on
      // the ORC version on the classpath; confirm before adding try-with-resources.
      Reader reader = OrcFile.createReader(pathToFile,
          new ReaderOptions(FileSystemUtil.getConfiguration()));
      return reader.getSchema();
    } catch (IOException e) {
      // OrcFile.createReader throws IOException in case of any failure, including trying
      // to open a non-ORC file. The message text is kept as is; the cause is chained.
      throw new AnalysisException("Failed to open file as an ORC file: " + e, e);
    }
  }

  /**
   * Converts a primitive ORC type to an Impala Type. The given type must be primitive.
   * Throws an IllegalStateException for a primitive category that has no mapping here.
   */
  private static Type convertPrimitiveOrcType(TypeDescription type) {
    Category category = type.getCategory();
    Preconditions.checkState(category.isPrimitive());
    switch (category) {
      case BINARY: return Type.STRING;
      case BOOLEAN: return Type.BOOLEAN;
      case BYTE: return Type.TINYINT;
      case CHAR: return ScalarType.createCharType(type.getMaxLength());
      case DATE: return Type.DATE;
      case DECIMAL:
        return ScalarType.createDecimalType(type.getPrecision(), type.getScale());
      case DOUBLE: return Type.DOUBLE;
      case FLOAT: return Type.FLOAT;
      case INT: return Type.INT;
      case LONG: return Type.BIGINT;
      case SHORT: return Type.SMALLINT;
      case STRING: return Type.STRING;
      case TIMESTAMP: return Type.TIMESTAMP;
      case VARCHAR: return ScalarType.createVarcharType(type.getMaxLength());
      default:
        throw new IllegalStateException(
            "Unexpected ORC primitive type: " + category.getName());
    }
  }

  /**
   * Converts an ORC list type to an Impala array Type. An ORC list contains one child,
   * the TypeDescription of the elements.
   */
  private static ArrayType convertArray(TypeDescription listType)
      throws AnalysisException {
    List<TypeDescription> children = listType.getChildren();
    Preconditions.checkState(children.size() == 1);
    return new ArrayType(convertOrcType(children.get(0)));
  }

  /**
   * Converts an ORC map type to an Impala map Type. An ORC map contains two children,
   * the TypeDescriptions for the keys and values. Throws an AnalysisException if the
   * key type is not primitive.
   */
  private static MapType convertMap(TypeDescription mapType) throws AnalysisException {
    // ORC maps have two children, one for the keys, one for the values.
    List<TypeDescription> children = mapType.getChildren();
    Preconditions.checkState(children.size() == 2);
    TypeDescription key = children.get(0);
    TypeDescription value = children.get(1);
    if (!key.getCategory().isPrimitive()) {
      throw new AnalysisException(String.format(ERROR_MSG, mapType.toString(), "MAP",
          "The key type of the MAP type must be primitive."));
    }
    return new MapType(convertOrcType(key), convertOrcType(value));
  }

  /**
   * Converts an ORC struct type to an Impala struct Type. Fields keep their ORC names
   * and their order.
   */
  private static StructType convertStruct(TypeDescription structType)
      throws AnalysisException {
    List<String> fieldNames = structType.getFieldNames();
    List<TypeDescription> subTypes = structType.getChildren();
    Preconditions.checkState(subTypes.size() == fieldNames.size());
    List<StructField> structFields = new ArrayList<>();
    for (int i = 0; i < subTypes.size(); i++) {
      Type fieldType = convertOrcType(subTypes.get(i));
      structFields.add(new StructField(fieldNames.get(i), fieldType));
    }
    return new StructType(structFields);
  }

  /**
   * Converts a non-primitive ORC type to an Impala Type. The given type must not be
   * primitive. Throws an AnalysisException for UNION, which is rejected as unsupported.
   */
  private static Type convertComplexOrcType(TypeDescription type)
      throws AnalysisException {
    Category category = type.getCategory();
    Preconditions.checkState(!category.isPrimitive());
    switch (category) {
      case LIST: return convertArray(type);
      case MAP: return convertMap(type);
      case STRUCT: return convertStruct(type);
      case UNION:
        // The field name is not known at this level, so report the offending ORC type
        // itself rather than the category name.
        throw new AnalysisException("Unsupported ORC type UNION: " + type.toString());
      default:
        throw new IllegalStateException(
            "Unexpected ORC complex type: " + category.getName());
    }
  }

  /**
   * Converts an ORC type to an Impala Type by dispatching on whether the ORC category
   * is primitive or complex.
   */
  private static Type convertOrcType(TypeDescription type) throws AnalysisException {
    if (type.getCategory().isPrimitive()) return convertPrimitiveOrcType(type);
    return convertComplexOrcType(type);
  }

  /**
   * Parses an ORC file stored in HDFS and returns the corresponding Impala schema.
   * This fails with an analysis exception if any errors occur reading the file,
   * parsing the ORC schema, or if the ORC types cannot be represented in Impala.
   *
   * Each top-level field of the ORC schema becomes one ColumnDef, in file order, with
   * a comment marking it as inferred from the ORC file.
   */
  public static List<ColumnDef> extract(HdfsUri location) throws AnalysisException {
    TypeDescription orcSchema = loadOrcSchema(location.getPath()); // Returns a STRUCT.
    List<TypeDescription> subTypes = orcSchema.getChildren();
    List<String> fieldNames = orcSchema.getFieldNames();
    Preconditions.checkState(subTypes.size() == fieldNames.size());

    List<ColumnDef> schema = new ArrayList<>();
    for (int i = 0; i < subTypes.size(); i++) {
      Type type = convertOrcType(subTypes.get(i));
      Preconditions.checkNotNull(type);
      // A fresh options map per column, so no map instance is shared between ColumnDefs.
      Map<ColumnDef.Option, Object> option = new HashMap<>();
      option.put(ColumnDef.Option.COMMENT, "Inferred from ORC file.");
      schema.add(new ColumnDef(fieldNames.get(i), new TypeDef(type), option));
    }
    return schema;
  }
}