| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.drill.metastore.util; |
| |
| import org.apache.drill.common.exceptions.DrillRuntimeException; |
| import org.apache.drill.common.expression.PathSegment; |
| import org.apache.drill.common.expression.SchemaPath; |
| import org.apache.drill.common.types.TypeProtos; |
| import org.apache.drill.common.types.Types; |
| import org.apache.drill.exec.record.MaterializedField; |
| import org.apache.drill.exec.record.metadata.ColumnMetadata; |
| import org.apache.drill.exec.record.metadata.DictColumnMetadata; |
| import org.apache.drill.exec.record.metadata.MetadataUtils; |
| import org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata; |
| import org.apache.drill.exec.record.metadata.TupleMetadata; |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Map; |
| |
/**
 * Utility methods for resolving and constructing {@link ColumnMetadata} entries in a
 * {@link TupleMetadata} schema based on {@link SchemaPath} column references.
 */
public class SchemaPathUtils {

  // Utility class: static methods only, no instances.
  private SchemaPathUtils() {
  }

  /**
   * Returns {@link ColumnMetadata} instance obtained from specified {@code TupleMetadata schema} which corresponds to
   * the specified column schema path.
   *
   * @param schemaPath schema path of the column which should be obtained
   * @param schema tuple schema where column should be searched
   * @return {@link ColumnMetadata} instance which corresponds to the specified column schema path,
   *         or {@code null} if no such column exists in the schema
   */
  public static ColumnMetadata getColumnMetadata(SchemaPath schemaPath, TupleMetadata schema) {
    // Strip array indexes so the lookup walks name segments only.
    PathSegment.NameSegment colPath = schemaPath.getUnIndexed().getRootSegment();
    ColumnMetadata colMetadata = schema.metadata(colPath.getPath());
    while (!colPath.isLastPath() && colMetadata != null) {
      if (colMetadata.isDict()) {
        // A child reference under a DICT resolves to the dict's value column:
        // dict keys are data, not schema columns, so deeper segments are not walked.
        colMetadata = ((DictColumnMetadata) colMetadata).valueColumnMetadata();
        break;
      }
      if (!colMetadata.isMap()) {
        // The path continues below a non-map column, so the requested nested
        // column cannot exist; report absence.
        colMetadata = null;
        break;
      }
      // Descend one level into the map's tuple schema.
      colPath = (PathSegment.NameSegment) colPath.getChild();
      colMetadata = colMetadata.tupleSchema().metadata(colPath.getPath());
    }
    return colMetadata;
  }

  /**
   * Checks if field identified by the schema path is child in either {@code DICT} or {@code REPEATED MAP}.
   * For such fields, nested in {@code DICT} or {@code REPEATED MAP},
   * filters can't be removed based on Parquet statistics.
   *
   * <p>The need for the check arises because statistics data is not obtained for such fields as their representation
   * differs from the 'canonical' one. For example, field {@code `a`} in Parquet's {@code STRUCT ARRAY} is represented
   * as {@code `struct_array`.`bag`.`array_element`.`a`} but once it is used in a filter, {@code ... WHERE struct_array[0].a = 1},
   * it has different representation (with indexes stripped): {@code `struct_array`.`a`} which is not present in statistics.
   * The same happens with DICT's {@code value}: for {@code SELECT ... WHERE dict_col['a'] = 0}, statistics exist for
   * {@code `dict_col`.`key_value`.`value`} but the field in filter is translated to {@code `dict_col`.`a`} and hence it is
   * considered not present in statistics. If the fields (such as ones shown in examples) are {@code OPTIONAL INT} then
   * the field is considered not present in a table and is treated as {@code NULL}. To avoid this situation, the method is used.</p>
   *
   * @param schemaPath schema path used in filter
   * @param schema schema containing all the fields in the file
   * @return {@literal true} if field is nested inside {@code DICT} (is {@code `key`} or {@code `value`})
   *         or inside {@code REPEATED MAP} field, {@literal false} otherwise.
   */
  public static boolean isFieldNestedInDictOrRepeatedMap(SchemaPath schemaPath, TupleMetadata schema) {
    // Strip array indexes: filters reference fields without the index segments.
    PathSegment.NameSegment colPath = schemaPath.getUnIndexed().getRootSegment();
    ColumnMetadata colMetadata = schema.metadata(colPath.getPath());
    while (!colPath.isLastPath() && colMetadata != null) {
      // Any remaining child segment under a DICT or a REPEATED MAP means the
      // referenced field is nested inside one of them.
      if (colMetadata.isDict() || (colMetadata.isMap() && Types.isRepeated(colMetadata.majorType()))) {
        return true;
      } else if (!colMetadata.isMap()) {
        // Path continues below a non-map, non-dict column: nothing more to check.
        break;
      }
      // Descend into the (non-repeated) map's tuple schema.
      colPath = (PathSegment.NameSegment) colPath.getChild();
      colMetadata = colMetadata.tupleSchema().metadata(colPath.getPath());
    }
    return false;
  }

  /**
   * Adds column with specified schema path and type into specified {@code TupleMetadata schema}.
   * For the case when specified {@link SchemaPath} has children, corresponding maps will be created
   * in the {@code TupleMetadata schema} and the last child of the map will have specified type.
   *
   * @param schema tuple schema where column should be added
   * @param schemaPath schema path of the column which should be added
   * @param type type of the column which should be added
   * @param types map from the compound schema path of each parent column to its major type,
   *              used to recognize intermediate DICT and LIST columns along the path
   * @throws DrillRuntimeException if an intermediate path segment resolves to an existing
   *         column that is neither a map nor a dict, or if the leaf column already exists
   *         with a different type
   */
  public static void addColumnMetadata(TupleMetadata schema, SchemaPath schemaPath,
      TypeProtos.MajorType type, Map<SchemaPath, TypeProtos.MajorType> types) {
    PathSegment.NameSegment colPath = schemaPath.getUnIndexed().getRootSegment();
    // Accumulates the name segments visited so far; used to build the compound
    // path for lookups in the `types` map.
    List<String> names = new ArrayList<>(types.size());
    // Used in case of LIST; defined here to avoid many instantiations inside while-loop
    List<String> nextNames = new ArrayList<>(names.size());
    ColumnMetadata colMetadata;
    while (!colPath.isLastPath()) {
      names.add(colPath.getPath());
      colMetadata = schema.metadata(colPath.getPath());
      TypeProtos.MajorType pathType = types.get(SchemaPath.getCompoundPath(names.toArray(new String[0])));

      // The following types, DICT and LIST, contain a nested segment in Parquet representation
      // (see ParquetReaderUtility#isLogicalListType(GroupType) and ParquetReaderUtility#isLogicalMapType(GroupType))
      // which we should skip when creating corresponding TupleMetadata representation. Additionally,
      // there is a need to track if the field is LIST to create appropriate column metadata based
      // on the info: whether to create singular MAP/DICT or MAP/DICT array.
      boolean isDict = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.DICT;
      boolean isList = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.LIST;
      String name = colPath.getPath();

      if (isList) {
        nextNames.clear();
        nextNames.addAll(names);

        // Parquet's LIST group (which represents an array) has
        // an inner group (bagSegment) which we want to skip here
        PathSegment.NameSegment bagSegment = colPath.getChild().getNameSegment();
        PathSegment.NameSegment elementSegment = bagSegment.getChild().getNameSegment();
        nextNames.add(bagSegment.getPath());
        nextNames.add(elementSegment.getPath());

        // Re-resolve the type at the list element's compound path.
        pathType = types.get(SchemaPath.getCompoundPath(nextNames.toArray(new String[0])));

        if (pathType == null && colPath.getChild().getChild().isLastPath()) {
          // The list is actually a repeated primitive:
          // will be handled after the while statement
          break;
        }

        // Continue traversal from the element segment, skipping the bag segment.
        colPath = elementSegment;

        names.add(bagSegment.getPath());
        names.add(elementSegment.getPath());

        // Check whether LIST's element type is DICT
        isDict = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.DICT;
      }

      if (colMetadata == null) {
        // Intermediate column does not exist yet: create a singular or repeated
        // MAP/DICT depending on whether the path went through a LIST.
        if (isDict) {
          colMetadata = isList ? MetadataUtils.newDictArray(name) : MetadataUtils.newDict(name);
        } else {
          colMetadata = isList ? MetadataUtils.newMapArray(name, null) : MetadataUtils.newMap(name, null);
        }
        schema.addColumn(colMetadata);
      }

      if (isDict) {
        // Parquet's MAP (which corresponds to DICT in Drill) has
        // an inner group which we want to skip here
        colPath = (PathSegment.NameSegment) colPath.getChild();
        names.add(colPath.getPath());
      }

      if (!colMetadata.isMap() && !colMetadata.isDict()) {
        // An existing column blocks the path: cannot nest children under a
        // non-map, non-dict column.
        throw new DrillRuntimeException(String.format("Expected map or dict, but was %s", colMetadata.majorType()));
      }

      // Descend into the nested tuple schema and advance to the next segment.
      schema = colMetadata.tupleSchema();
      colPath = (PathSegment.NameSegment) colPath.getChild();
    }

    // Leaf segment: add the primitive column, or verify the existing one matches.
    colMetadata = schema.metadata(colPath.getPath());
    if (colMetadata == null) {
      schema.addColumn(new PrimitiveColumnMetadata(MaterializedField.create(colPath.getPath(), type)));
    } else if (!colMetadata.majorType().equals(type)) {
      throw new DrillRuntimeException(String.format("Types mismatch: existing type: %s, new type: %s", colMetadata.majorType(), type));
    }
  }
}