blob: a89d4353583d6d50fad32b9ad4f1e8084f0412fa [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.metastore.util;
import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.expression.PathSegment;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.metadata.ColumnMetadata;
import org.apache.drill.exec.record.metadata.DictColumnMetadata;
import org.apache.drill.exec.record.metadata.MetadataUtils;
import org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class SchemaPathUtils {
private SchemaPathUtils() {
* Returns {@link ColumnMetadata} instance obtained from specified {@code TupleMetadata schema} which corresponds to
* the specified column schema path.
* @param schemaPath schema path of the column which should be obtained
* @param schema tuple schema where column should be searched
* @return {@link ColumnMetadata} instance which corresponds to the specified column schema path
public static ColumnMetadata getColumnMetadata(SchemaPath schemaPath, TupleMetadata schema) {
PathSegment.NameSegment colPath = schemaPath.getUnIndexed().getRootSegment();
ColumnMetadata colMetadata = schema.metadata(colPath.getPath());
while (!colPath.isLastPath() && colMetadata != null) {
if (colMetadata.isDict()) {
colMetadata = ((DictColumnMetadata) colMetadata).valueColumnMetadata();
if (!colMetadata.isMap()) {
colMetadata = null;
colPath = (PathSegment.NameSegment) colPath.getChild();
colMetadata = colMetadata.tupleSchema().metadata(colPath.getPath());
return colMetadata;
* Checks if field identified by the schema path is child in either {@code DICT} or {@code REPEATED MAP}.
* For such fields, nested in {@code DICT} or {@code REPEATED MAP},
* filters can't be removed based on Parquet statistics.
* <p>The need for the check arises because statistics data is not obtained for such fields as their representation
* differs from the 'canonical' one. For example, field {@code `a`} in Parquet's {@code STRUCT ARRAY} is represented
* as {@code `struct_array`.`bag`.`array_element`.`a`} but once it is used in a filter, {@code ... WHERE struct_array[0].a = 1},
* it has different representation (with indexes stripped): {@code `struct_array`.`a`} which is not present in statistics.
* The same happens with DICT's {@code value}: for {@code SELECT ... WHERE dict_col['a'] = 0}, statistics exist for
* {@code `dict_col`.`key_value`.`value`} but the field in filter is translated to {@code `dict_col`.`a`} and hence it is
* considered not present in statistics. If the fields (such as ones shown in examples) are {@code OPTIONAL INT} then
* the field is considered not present in a table and is treated as {@code NULL}. To avoid this situation, the method is used.</p>
* @param schemaPath schema path used in filter
* @param schema schema containing all the fields in the file
* @return {@literal true} if field is nested inside {@code DICT} (is {@code `key`} or {@code `value`})
* or inside {@code REPEATED MAP} field, {@literal false} otherwise.
public static boolean isFieldNestedInDictOrRepeatedMap(SchemaPath schemaPath, TupleMetadata schema) {
PathSegment.NameSegment colPath = schemaPath.getUnIndexed().getRootSegment();
ColumnMetadata colMetadata = schema.metadata(colPath.getPath());
while (!colPath.isLastPath() && colMetadata != null) {
if (colMetadata.isDict() || (colMetadata.isMap() && Types.isRepeated(colMetadata.majorType()))) {
return true;
} else if (!colMetadata.isMap()) {
colPath = (PathSegment.NameSegment) colPath.getChild();
colMetadata = colMetadata.tupleSchema().metadata(colPath.getPath());
return false;
* Adds column with specified schema path and type into specified {@code TupleMetadata schema}.
* For the case when specified {@link SchemaPath} has children, corresponding maps will be created
* in the {@code TupleMetadata schema} and the last child of the map will have specified type.
* @param schema tuple schema where column should be added
* @param schemaPath schema path of the column which should be added
* @param type type of the column which should be added
* @param types list of column's parent types
public static void addColumnMetadata(TupleMetadata schema, SchemaPath schemaPath,
TypeProtos.MajorType type, Map<SchemaPath, TypeProtos.MajorType> types) {
PathSegment.NameSegment colPath = schemaPath.getUnIndexed().getRootSegment();
List<String> names = new ArrayList<>(types.size());
// Used in case of LIST; defined here to avoid many instantiations inside while-loop
List<String> nextNames = new ArrayList<>(names.size());
ColumnMetadata colMetadata;
while (!colPath.isLastPath()) {
colMetadata = schema.metadata(colPath.getPath());
TypeProtos.MajorType pathType = types.get(SchemaPath.getCompoundPath(names.toArray(new String[0])));
// The following types, DICT and LIST, contain a nested segment in Parquet representation
// (see ParquetReaderUtility#isLogicalListType(GroupType) and ParquetReaderUtility#isLogicalMapType(GroupType))
// which we should skip when creating corresponding TupleMetadata representation. Additionally,
// there is a need to track if the field is LIST to create appropriate column metadata based
// on the info: whether to create singular MAP/DICT or MAP/DICT array.
boolean isDict = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.DICT;
boolean isList = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.LIST;
String name = colPath.getPath();
if (isList) {
// Parquet's LIST group (which represents an array) has
// an inner group (bagSegment) which we want to skip here
PathSegment.NameSegment bagSegment = colPath.getChild().getNameSegment();
PathSegment.NameSegment elementSegment = bagSegment.getChild().getNameSegment();
pathType = types.get(SchemaPath.getCompoundPath(nextNames.toArray(new String[0])));
if (pathType == null && colPath.getChild().getChild().isLastPath()) {
// The list is actually a repeated primitive:
// will be handled after the while statement
colPath = elementSegment;
// Check whether LIST's element type is DICT
isDict = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.DICT;
if (colMetadata == null) {
if (isDict) {
colMetadata = isList ? MetadataUtils.newDictArray(name) : MetadataUtils.newDict(name);
} else {
colMetadata = isList ? MetadataUtils.newMapArray(name, null) : MetadataUtils.newMap(name, null);
if (isDict) {
// Parquet's MAP (which corresponds to DICT in Drill) has
// an inner group which we want to skip here
colPath = (PathSegment.NameSegment) colPath.getChild();
if (!colMetadata.isMap() && !colMetadata.isDict()) {
throw new DrillRuntimeException(String.format("Expected map or dict, but was %s", colMetadata.majorType()));
schema = colMetadata.tupleSchema();
colPath = (PathSegment.NameSegment) colPath.getChild();
colMetadata = schema.metadata(colPath.getPath());
if (colMetadata == null) {
schema.addColumn(new PrimitiveColumnMetadata(MaterializedField.create(colPath.getPath(), type)));
} else if (!colMetadata.majorType().equals(type)) {
throw new DrillRuntimeException(String.format("Types mismatch: existing type: %s, new type: %s", colMetadata.majorType(), type));