// blob: 62b3b56c38c669fd5d4b75101b5508bf75f4b9ec [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.parquet.columnreaders;
import java.util.Map;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.DataMode;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.physical.impl.OutputMutator;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.parquet.ParquetReaderUtility;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.complex.RepeatedValueVector;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
/**
 * Describes a single column that the record reader reads from a Parquet file:
 * its Parquet descriptor, the Drill type and field resolved for it, and the
 * value vector its readers write into.
 */
public class ParquetColumnMetadata {

  /**
   * Value reported by {@link #getDataTypeLength()} when the column is either
   * not fixed length or is repeated.
   */
  public static final int UNDEFINED_LENGTH = -1;

  ColumnDescriptor column;
  MaterializedField field;
  // Set by resolveDrillType() from getDataTypeLength(); UNDEFINED_LENGTH when no fixed length applies.
  int length;
  ColumnChunkMetaData columnChunkMetaData;

  private SchemaElement schemaElement;
  private MajorType majorType;
  private ValueVector valueVector;

  /**
   * Creates metadata for the given Parquet column. The Drill type, field and
   * length are not populated until
   * {@link #resolveDrillType(Map, OptionManager)} is called.
   *
   * @param column the Parquet descriptor of the column
   */
  public ParquetColumnMetadata(ColumnDescriptor column) {
    this.column = column;
  }

  /**
   * Resolves the Drill-side description of this column: looks up its schema
   * element by full column path, converts the Parquet type to a Drill major
   * type, creates the materialized field (named after the last segment of the
   * column path) and records the data type length.
   *
   * @param schemaElements schema elements keyed by full column path
   * @param options options handed to the Parquet-to-Drill type conversion
   */
  public void resolveDrillType(Map<String, SchemaElement> schemaElements, OptionManager options) {
    final String fullPath = ParquetReaderUtility.getFullColumnPath(column);
    schemaElement = schemaElements.get(fullPath);

    final PrimitiveTypeName parquetType = column.getType();
    final int parquetTypeLength = column.getTypeLength();
    final DataMode mode = getDataMode(column);
    majorType = ParquetToDrillTypeConverter.toMajorType(
        parquetType, parquetTypeLength, mode, schemaElement, options);

    final String leafName = SchemaPath.getCompoundPath(column.getPath())
        .getLastSegment()
        .getNameSegment()
        .getPath();
    field = MaterializedField.create(leafName, majorType);

    length = getDataTypeLength();
  }

  /**
   * Derives the Drill data mode: repeated when this column is repeated,
   * required when the descriptor's maximum definition level is zero, and
   * optional in every other case.
   */
  private TypeProtos.DataMode getDataMode(ColumnDescriptor descriptor) {
    if (isRepeated()) {
      return DataMode.REPEATED;
    }
    return descriptor.getMaxDefinitionLevel() == 0 ? DataMode.REQUIRED : DataMode.OPTIONAL;
  }

  /**
   * Returns the width, in bits, of a fixed length Parquet primitive type.
   *
   * @param type a fixed length type from the parquet library enum
   * @return the number of bits one value of the type occupies
   * @throws IllegalStateException if the type is not one of the fixed width
   *         primitives handled here (for example binary or fixed length byte array)
   */
  public static int getTypeLengthInBits(PrimitiveTypeName type) {
    switch (type) {
      case BOOLEAN:
        return 1;
      case INT32:
      case FLOAT:
        return 32;
      case INT64:
      case DOUBLE:
        return 64;
      case INT96:
        return 96;
      // binary and fixed length byte array
      default:
        throw new IllegalStateException("Length cannot be determined for type " + type);
    }
  }

  /**
   * Returns the data type length for this column, using both its
   * {@link ColumnDescriptor} and its corresponding {@link SchemaElement}.
   * Neither is enough on its own: the max repetition level (which marks an
   * array type) lives in the column descriptor, while the length of a fixed
   * length byte array is stored at the schema level.
   *
   * @return the length if fixed width, else {@code UNDEFINED_LENGTH} (-1)
   */
  public int getDataTypeLength() {
    if (!isFixedLength() || isRepeated()) {
      return UNDEFINED_LENGTH;
    }

    final PrimitiveTypeName parquetType = column.getType();
    if (parquetType == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
      // Scaled by 8 to line up with the bit widths used for the other types;
      // presumably the schema element length is in bytes - confirm against the Parquet format.
      return schemaElement.getType_length() * Byte.SIZE;
    }
    return getTypeLengthInBits(parquetType);
  }

  /**
   * @return {@code true} unless the column's Parquet type is {@code BINARY}
   */
  public boolean isFixedLength() {
    final boolean binary = column.getType() == PrimitiveTypeName.BINARY;
    return !binary;
  }

  /**
   * @return {@code true} if the column's maximum repetition level is above zero
   */
  public boolean isRepeated() {
    final int maxRepetitionLevel = column.getMaxRepetitionLevel();
    return maxRepetitionLevel > 0;
  }

  /**
   * @return the materialized field created by
   *         {@link #resolveDrillType(Map, OptionManager)}, or {@code null} if
   *         it has not been set
   */
  public MaterializedField getField() {
    return field;
  }

  /**
   * Adds this column's field to the output using the vector class that
   * matches the resolved major type, and keeps the resulting vector for the
   * reader factory methods of this class.
   *
   * @param output the mutator the field is added to
   * @return the vector returned by the mutator
   * @throws SchemaChangeException declared by the mutator's add operation
   */
  ValueVector buildVector(OutputMutator output) throws SchemaChangeException {
    final Class<? extends ValueVector> implementation =
        TypeHelper.getValueVectorClass(majorType.getMinorType(), majorType.getMode());
    valueVector = output.addField(field, implementation);
    return valueVector;
  }

  /**
   * Creates a fixed column reader that reads this column into the vector
   * built by {@link #buildVector(OutputMutator)}.
   */
  ColumnReader<?> makeFixedWidthReader(ParquetRecordReader reader) throws Exception {
    final boolean fixedLength = true;
    return ColumnReaderFactory.createFixedColumnReader(
        reader, fixedLength, column, columnChunkMetaData, valueVector, schemaElement);
  }

  /**
   * Creates a reader for a repeated fixed width column. A fixed column reader
   * is created over the repeated vector's data vector and then wrapped in a
   * {@link FixedWidthRepeatedReader}.
   */
  FixedWidthRepeatedReader makeRepeatedFixedWidthReader(ParquetRecordReader reader) throws Exception {
    final RepeatedValueVector repeated = RepeatedValueVector.class.cast(valueVector);
    final boolean fixedLength = true;

    // The element reader is created before the bit width is looked up, matching the original ordering.
    final ColumnReader<?> elementReader = ColumnReaderFactory.createFixedColumnReader(
        reader, fixedLength, column, columnChunkMetaData, repeated.getDataVector(), schemaElement);
    final int elementBits = getTypeLengthInBits(column.getType());

    return new FixedWidthRepeatedReader(
        reader, elementReader, elementBits, column, columnChunkMetaData, false, repeated, schemaElement);
  }

  /**
   * Obtains a variable length reader for this column from the reader factory,
   * targeting the vector built by {@link #buildVector(OutputMutator)}.
   */
  VarLengthValuesColumn<?> makeVariableWidthReader(ParquetRecordReader reader) throws ExecutionSetupException {
    final boolean fixedLength = false;
    return ColumnReaderFactory.getReader(
        reader, column, columnChunkMetaData, fixedLength, valueVector, schemaElement);
  }
}