blob: e6f10a07901aed2849ba47a4c4bcac3cb40a1eb4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.SchemaEvolution;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.apache.hadoop.util.StringUtils.COMMA_STR;
public class OrcUtils {
/**
* Returns selected columns as a boolean array with true value set for specified column names.
* The result will contain number of elements equal to flattened number of columns.
* For example:
* selectedColumns - a,b,c
* allColumns - a,b,c,d
* If column c is a complex type, say list<string> and other types are
* primitives then result will
* be [false, true, true, true, true, true, false]
* Index 0 is the root element of the struct which is set to false by default, index 1,2
* corresponds to columns a and b. Index 3,4 correspond to column c which is list<string> and
* index 5 correspond to column d. After flattening list<string> gets 2 columns.
*
* Column names that aren't found are ignored.
* @param selectedColumns - comma separated list of selected column names
* @param schema - object schema
* @return - boolean array with true value set for the specified column names
*/
public static boolean[] includeColumns(String selectedColumns,
TypeDescription schema) {
int numFlattenedCols = schema.getMaximumId();
boolean[] results = new boolean[numFlattenedCols + 1];
if ("*".equals(selectedColumns)) {
Arrays.fill(results, true);
return results;
}
TypeDescription baseSchema = SchemaEvolution.checkAcidSchema(schema) ?
SchemaEvolution.getBaseRow(schema) : schema;
if (selectedColumns != null &&
baseSchema.getCategory() == TypeDescription.Category.STRUCT) {
for (String columnName : selectedColumns.split(COMMA_STR)) {
TypeDescription column = findColumn(baseSchema, columnName.trim());
if (column != null) {
for (int i = column.getId(); i <= column.getMaximumId(); ++i) {
results[i] = true;
}
}
}
}
return results;
}
private static TypeDescription findColumn(TypeDescription schema, String column) {
TypeDescription result = schema;
String[] columnMatcher = column.split("\\.");
int index = 0;
while (index < columnMatcher.length &&
result.getCategory() == TypeDescription.Category.STRUCT) {
String columnName = columnMatcher[index];
int prevIndex = index;
List<TypeDescription> fields = result.getChildren();
List<String> fieldNames = result.getFieldNames();
for (int i = 0; i < fields.size(); i++) {
if (columnName.equalsIgnoreCase(fieldNames.get(i))) {
result = fields.get(i);
index++;
break;
}
}
if (prevIndex == index) {
return null;
}
}
return result;
}
public static List<OrcProto.Type> getOrcTypes(TypeDescription typeDescr) {
List<OrcProto.Type> result = new ArrayList<>();
appendOrcTypes(result, typeDescr);
return result;
}
private static void appendOrcTypes(List<OrcProto.Type> result, TypeDescription typeDescr) {
OrcProto.Type.Builder type = OrcProto.Type.newBuilder();
List<TypeDescription> children = typeDescr.getChildren();
// save the attributes
for(String key: typeDescr.getAttributeNames()) {
type.addAttributes(
OrcProto.StringPair.newBuilder()
.setKey(key).setValue(typeDescr.getAttributeValue(key))
.build());
}
switch (typeDescr.getCategory()) {
case BOOLEAN:
type.setKind(OrcProto.Type.Kind.BOOLEAN);
break;
case BYTE:
type.setKind(OrcProto.Type.Kind.BYTE);
break;
case SHORT:
type.setKind(OrcProto.Type.Kind.SHORT);
break;
case INT:
type.setKind(OrcProto.Type.Kind.INT);
break;
case LONG:
type.setKind(OrcProto.Type.Kind.LONG);
break;
case FLOAT:
type.setKind(OrcProto.Type.Kind.FLOAT);
break;
case DOUBLE:
type.setKind(OrcProto.Type.Kind.DOUBLE);
break;
case STRING:
type.setKind(OrcProto.Type.Kind.STRING);
break;
case CHAR:
type.setKind(OrcProto.Type.Kind.CHAR);
type.setMaximumLength(typeDescr.getMaxLength());
break;
case VARCHAR:
type.setKind(OrcProto.Type.Kind.VARCHAR);
type.setMaximumLength(typeDescr.getMaxLength());
break;
case BINARY:
type.setKind(OrcProto.Type.Kind.BINARY);
break;
case TIMESTAMP:
type.setKind(OrcProto.Type.Kind.TIMESTAMP);
break;
case TIMESTAMP_INSTANT:
type.setKind(OrcProto.Type.Kind.TIMESTAMP_INSTANT);
break;
case DATE:
type.setKind(OrcProto.Type.Kind.DATE);
break;
case DECIMAL:
type.setKind(OrcProto.Type.Kind.DECIMAL);
type.setPrecision(typeDescr.getPrecision());
type.setScale(typeDescr.getScale());
break;
case LIST:
type.setKind(OrcProto.Type.Kind.LIST);
type.addSubtypes(children.get(0).getId());
break;
case MAP:
type.setKind(OrcProto.Type.Kind.MAP);
for(TypeDescription t: children) {
type.addSubtypes(t.getId());
}
break;
case STRUCT:
type.setKind(OrcProto.Type.Kind.STRUCT);
for(TypeDescription t: children) {
type.addSubtypes(t.getId());
}
for(String field: typeDescr.getFieldNames()) {
type.addFieldNames(field);
}
break;
case UNION:
type.setKind(OrcProto.Type.Kind.UNION);
for(TypeDescription t: children) {
type.addSubtypes(t.getId());
}
break;
default:
throw new IllegalArgumentException("Unknown category: " +
typeDescr.getCategory());
}
result.add(type.build());
if (children != null) {
for(TypeDescription child: children) {
appendOrcTypes(result, child);
}
}
}
/**
* Checks whether the list of protobuf types from the file are valid or not.
* @param types the list of types from the protobuf
* @param root the top of the tree to check
* @return the next available id
* @throws java.io.IOException if the tree is invalid
*/
public static int isValidTypeTree(List<OrcProto.Type> types,
int root) throws IOException {
if (root < 0 || root >= types.size()) {
throw new IOException("Illegal type id " + root +
". The valid range is 0 to " + (types.size() - 1));
}
OrcProto.Type rootType = types.get(root);
int current = root+1;
List<Integer> children = rootType.getSubtypesList();
if (!rootType.hasKind()) {
throw new IOException("Type " + root + " has an unknown kind.");
}
// ensure that we have the right number of children
switch(rootType.getKind()) {
case LIST:
if (children == null || children.size() != 1) {
throw new IOException("Wrong number of type children in list " + root);
}
break;
case MAP:
if (children == null || children.size() != 2) {
throw new IOException("Wrong number of type children in map " + root);
}
break;
case UNION:
case STRUCT:
break;
default:
if (children != null && children.size() != 0) {
throw new IOException("Type children under primitive type " + root);
}
}
// ensure the children are also correct
if (children != null) {
for(int child: children) {
if (child != current) {
throw new IOException("Unexpected child type id " + child + " when " +
current + " was expected.");
}
current = isValidTypeTree(types, current);
}
}
return current;
}
/**
* Translate the given rootColumn from the list of types to a TypeDescription.
* @param types all of the types
* @param rootColumn translate this type
* @return a new TypeDescription that matches the given rootColumn
*/
public static
TypeDescription convertTypeFromProtobuf(List<OrcProto.Type> types,
int rootColumn)
throws FileFormatException {
OrcProto.Type type = types.get(rootColumn);
TypeDescription result;
switch (type.getKind()) {
case BOOLEAN:
result = TypeDescription.createBoolean();
break;
case BYTE:
result = TypeDescription.createByte();
break;
case SHORT:
result = TypeDescription.createShort();
break;
case INT:
result = TypeDescription.createInt();
break;
case LONG:
result = TypeDescription.createLong();
break;
case FLOAT:
result = TypeDescription.createFloat();
break;
case DOUBLE:
result = TypeDescription.createDouble();
break;
case STRING:
result = TypeDescription.createString();
break;
case CHAR:
case VARCHAR: {
result = type.getKind() == OrcProto.Type.Kind.CHAR ?
TypeDescription.createChar() : TypeDescription.createVarchar();
if (type.hasMaximumLength()) {
result.withMaxLength(type.getMaximumLength());
}
}
break;
case BINARY:
result = TypeDescription.createBinary();
break;
case TIMESTAMP:
result = TypeDescription.createTimestamp();
break;
case TIMESTAMP_INSTANT:
result = TypeDescription.createTimestampInstant();
break;
case DATE:
result = TypeDescription.createDate();
break;
case DECIMAL: {
result = TypeDescription.createDecimal();
if (type.hasScale()) {
result.withScale(type.getScale());
}
if (type.hasPrecision()) {
result.withPrecision(type.getPrecision());
}
}
break;
case LIST:
if (type.getSubtypesCount() != 1) {
throw new FileFormatException("LIST type should contain exactly " +
"one subtype but has " + type.getSubtypesCount());
}
result = TypeDescription.createList(
convertTypeFromProtobuf(types, type.getSubtypes(0)));
break;
case MAP:
if (type.getSubtypesCount() != 2) {
throw new FileFormatException("MAP type should contain exactly " +
"two subtypes but has " + type.getSubtypesCount());
}
result = TypeDescription.createMap(
convertTypeFromProtobuf(types, type.getSubtypes(0)),
convertTypeFromProtobuf(types, type.getSubtypes(1)));
break;
case STRUCT: {
result = TypeDescription.createStruct();
for(int f=0; f < type.getSubtypesCount(); ++f) {
result.addField(type.getFieldNames(f),
convertTypeFromProtobuf(types, type.getSubtypes(f)));
}
}
break;
case UNION: {
if (type.getSubtypesCount() == 0) {
throw new FileFormatException("UNION type should contain at least" +
" one subtype but has none");
}
result = TypeDescription.createUnion();
for(int f=0; f < type.getSubtypesCount(); ++f) {
result.addUnionChild(
convertTypeFromProtobuf(types, type.getSubtypes(f)));
}
}
break;
default:
throw new IllegalArgumentException("Unknown ORC type " + type.getKind());
}
for(int i = 0; i < type.getAttributesCount(); ++i) {
OrcProto.StringPair pair = type.getAttributes(i);
result.setAttribute(pair.getKey(), pair.getValue());
}
return result;
}
public static List<StripeInformation> convertProtoStripesToStripes(
List<OrcProto.StripeInformation> stripes) {
List<StripeInformation> result = new ArrayList<>(stripes.size());
long previousStripeId = 0;
byte[][] previousKeys = null;
long stripeId = 0;
for (OrcProto.StripeInformation stripeProto: stripes) {
ReaderImpl.StripeInformationImpl stripe =
new ReaderImpl.StripeInformationImpl(stripeProto, stripeId++,
previousStripeId, previousKeys);
result.add(stripe);
previousStripeId = stripe.getEncryptionStripeId();
previousKeys = stripe.getEncryptedLocalKeys();
}
return result;
}
}