exec/vector/src/main/java/org/apache/drill/exec/record/metadata/ColumnMetadata.java - drill - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.drill.exec.record.metadata;

 import java.time.format.DateTimeFormatter;

 import org.apache.drill.common.types.TypeProtos.DataMode;
 import org.apache.drill.common.types.TypeProtos.MajorType;
 import org.apache.drill.common.types.TypeProtos.MinorType;
 import org.apache.drill.exec.record.MaterializedField;
 import org.apache.drill.exec.vector.accessor.ColumnWriter;

 /**
  * Metadata description of a column including names, types and structure
  * information.
  */
 public interface ColumnMetadata extends Propertied {

   /**
    * Predicted number of elements per array entry. Default is
    * taken from the often hard-coded value of 10.
    */
   String EXPECTED_CARDINALITY_PROP = DRILL_PROP_PREFIX + "cardinality";

   /**
    * Default value represented as a string.
    */
   String DEFAULT_VALUE_PROP = DRILL_PROP_PREFIX + "default";

   /**
    * Expected (average) width for variable-width columns.
    */
   String EXPECTED_WIDTH_PROP = DRILL_PROP_PREFIX + "width";

   /**
    * Optional format to use when converting to/from string values.
    */
   String FORMAT_PROP = DRILL_PROP_PREFIX + "format";

   /**
    * Indicates how to handle blanks. Must be one of the valid values defined
    * in AbstractConvertFromString. Normally set on the converter by the plugin
    * rather than by the user in the schema.
    */
   String BLANK_AS_PROP = DRILL_PROP_PREFIX + "blank-as";

   /**
    * Convert blanks to null values (if the column is nullable), or
    * fill with the default value (non-nullable.)
    */
   String BLANK_AS_NULL = "null";

   /**
    * Convert blanks for numeric fields to 0. For non-numeric
    * fields, convert to null (for nullable) or the default value
    * (for non-nullable). Works best if non-numeric fields are declared
    * as nullable.
    */
   String BLANK_AS_ZERO = "0";

   /**
    * Indicates whether to project the column in a wildcard (*) query.
    * Special columns may be excluded from projection. Certain "special"
    * columns may be available only when explicitly requested. For example,
    * the log reader has a "_raw" column which includes the entire input
    * line before parsing. This column can be requested explicitly:<br>
    * {@code SELECT foo, bar, _raw FROM ...}<br>
    * but the column will <i>not</i> be included when using the wildcard:<br>
    * {@code SELECT * FROM ...}
    * <p>
    * Marking a column (either in the provided schema or the reader schema)
    * will prevent that column from appearing in a wildcard expansion.
    */
   String EXCLUDE_FROM_WILDCARD = DRILL_PROP_PREFIX + "special";

   /**
    * Default array size of 10.
    * NOTE(review): presumably the default expected cardinality for array
    * columns (see {@link #setExpectedElementCount(int)}) -- confirm against
    * the implementations.
    */
   int DEFAULT_ARRAY_SIZE = 10;

   /**
    * Indicates that a provided schema column is an implicit column
    * (one defined by Drill rather than the reader.) Allows the implicit
    * schema to reify partition names, say, as reader-specific names.
    * For example, {@code dir0} might be reified as {@code year}, etc.
    * <p>
    * Available when the underlying reader supports implicit columns.
    * The value is the defined implicit column name (not the name
    * set via system/session options.) Using the defined name makes
    * the provided schema immune from runtime changes to column names.
    * <p>
    * As the result of adding this feature, any column <i>not</i>
    * tagged as implicit is a reader column, even if that column
    * happens to have the same (currently selected runtime) name
    * as an implicit column.
    */
   String IMPLICIT_COL_TYPE = DRILL_PROP_PREFIX + "implicit";

   /**
    * Fully-qualified name implicit column type.
    */
   String IMPLICIT_FQN = "fqn";

   /**
    * File path implicit column type.
    */
   String IMPLICIT_FILEPATH = "filepath";

   /**
    * File name implicit column type.
    */
   String IMPLICIT_FILENAME = "filename";

   /**
    * File suffix implicit column type.
    */
   String IMPLICIT_SUFFIX = "suffix";

   /**
    * Prefix for partition directories. dir0 is the table root
    * folder, dir1 the first subdirectory, and so on. Directories that
    * don't exist in the actual file path take a {@code NULL} value.
    */
   String IMPLICIT_PARTITION_PREFIX = "dir";

   /**
    * Rough characterization of Drill types into metadata categories.
    * Various aspects of Drill's type system are very, very messy.
    * However, Drill is defined by its code, not some abstract design,
    * so the metadata system here does the best job it can to simplify
    * the messy type system while staying close to the underlying
    * implementation.
    */
   enum StructureType {

     /**
      * Primitive column (all types except List, Map and Union.)
      * Includes (one-dimensional) arrays of those types.
      */
     PRIMITIVE,

     /**
      * Map or repeated map. Also describes the row as a whole.
      */
     TUPLE,

     /**
      * Union or (non-repeated) list. (A non-repeated list is,
      * essentially, a repeated union.)
      */
     VARIANT,

     /**
      * A repeated list. A repeated list is not simply the repeated
      * form of a list, it is something else entirely. It acts as
      * a dimensional wrapper around any other type (except list)
      * and adds a non-nullable extra dimension. Hence, this type is
      * for 2D+ arrays.
      * <p>
      * In theory, a 2D list of, say, INT would be an INT column, but
      * repeated in two dimensions. Alas, that is not how it is. Also,
      * if we have a separate category for 2D lists, we should have
      * a separate category for 1D lists. But, again, that is not how
      * the code has evolved.
      */
     MULTI_ARRAY,

     /**
      * Dict or repeated dict.
      */
     DICT,

     /**
      * Unknown, specified at runtime. (Only for logical columns,
      * not for physical columns.)
      */
     DYNAMIC
   }

   /**
    * Returns the {@link StructureType} category that describes this column.
    *
    * @return the structure type of this column
    */
   StructureType structureType();

   /**
    * Schema for {@code TUPLE} columns.
    *
    * @return the tuple schema
    */
   TupleMetadata tupleSchema();

   /**
    * Schema for {@code VARIANT} columns.
    *
    * @return the variant schema
    */
   VariantMetadata variantSchema();

   /**
    * Schema of inner dimension for {@code MULTI_ARRAY} columns.
    * If an array is 3D, the outer column represents all 3 dimensions.
    * {@code outer.childSchema()} gives another {@code MULTI_ARRAY}
    * for the inner 2D array.
    * {@code outer.childSchema().childSchema()} gives a column
    * of some other type (but repeated) for the 1D array.
    * <p>
    * Sorry for the mess, but it is how the code works and we are not
    * in a position to revisit data type fundamentals.
    *
    * @return the description of the (n-1)st dimension.
    */
   ColumnMetadata childSchema();

   // NOTE(review): the accessors below carry no documentation in this
   // interface; their exact semantics are defined by the implementing
   // classes. Confirm there before relying on edge-case behavior.
   MaterializedField schema();
   MaterializedField emptySchema();
   String name();
   MinorType type();
   MajorType majorType();
   DataMode mode();
   int dimensions();
   boolean isNullable();
   boolean isArray();
   boolean isVariableWidth();
   boolean isMap();
   boolean isVariant();
   boolean isDict();
   boolean isScalar();

   /**
    * Reports if the column is dynamic. A dynamic column is one with
    * a "type to be named later." It is valid for describing a dynamic
    * schema, but not for creating vectors; to create a vector the
    * column must be resolved to a concrete type. The context should
    * make it clear if any columns can be dynamic.
    *
    * @return {@code true} if the column does not yet have a concrete
    * type, {@code false} if the column type is concrete
    */
   boolean isDynamic();

   /**
    * Determine if the schema represents a column with a LIST type with
    * UNION elements. (Lists can be of a single
    * type (with nullable elements) or can be of unions.)
    *
    * @return true if the column is of type LIST of UNIONs
    */
   boolean isMultiList();

   /**
    * Report whether one column is equivalent to another. Columns are equivalent
    * if they have the same name, type and structure (ignoring internal structure
    * such as properties.)
    *
    * @param other the column to compare against this one
    * @return true if the two columns are equivalent as described above
    */
   boolean isEquivalent(ColumnMetadata other);

   /**
    * For variable-width columns, specify the expected column width to be used
    * when allocating a new vector. Does nothing for fixed-width columns.
    *
    * @param width the expected column width
    */
   void setExpectedWidth(int width);

   /**
    * Get the expected width for a column. This is the actual width for fixed-
    * width columns, the specified width (defaulting to 50) for variable-width
    * columns.
    *
    * @return the expected column width of each data value. Does not include
    * "overhead" space such as for the null-value vector or offset vector
    */
   int expectedWidth();

   /**
    * For an array column, specify the expected average array cardinality.
    * Ignored for non-array columns. Used when allocating new vectors.
    *
    * @param childCount the expected average array cardinality. Defaults to
    * 1 for non-array columns, 10 for array columns
    */
   void setExpectedElementCount(int childCount);

   /**
    * Returns the expected array cardinality for array columns, or 1 for
    * non-array columns.
    *
    * @return the expected value cardinality per value (per-row for top-level
    * columns, per array element for arrays within lists)
    */
   int expectedElementCount();

   /**
    * Sets the format for this column.
    * NOTE(review): presumably stored as the {@link #FORMAT_PROP} property
    * and used when converting to/from string values -- confirm.
    *
    * @param value the format string
    */
   void setFormat(String value);

   /**
    * Returns the format for this column.
    * NOTE(review): presumably the value given to {@link #setFormat(String)};
    * the result when no format was set is not specified here -- confirm.
    *
    * @return the format string
    */
   String format();

   /**
    * Returns the formatter to use for date/time values. Only valid for
    * date/time columns.
    *
    * @return the formatter to use for date/time values
    */
   DateTimeFormatter dateTimeFormatter();

   /**
    * Sets the default value property using the string-encoded form of the value.
    * The default value is used for filling a vector when no real data is available.
    *
    * @param value the default value in String representation
    */
   void setDefaultValue(String value);

   /**
    * Returns the default value for this column in String literal representation.
    *
    * @return the default value in String literal representation, or null if no
    * default value has been set
    */
   String defaultValue();

   /**
    * Returns the default value decoded into object form. This is the same as:
    * <pre><code>decodeValue(defaultValue());
    * </code></pre>
    * NOTE(review): no {@code decodeValue} method is declared in this
    * interface; {@link #valueFromString(String)} looks like the intended
    * method -- confirm.
    *
    * @return the default value decoded as an object that can be passed to
    * the {@link ColumnWriter#setObject} method.
    */
   Object decodeDefaultValue();

   /**
    * Converts a value object to a string.
    * NOTE(review): presumably the inverse of {@link #valueFromString(String)},
    * based on this column's type and format -- confirm against implementations.
    *
    * @param value the value to convert
    * @return the string form of the value
    */
   String valueToString(Object value);

   /**
    * Converts a string to a value object.
    * NOTE(review): presumably the inverse of {@link #valueToString(Object)},
    * based on this column's type and format -- confirm against implementations.
    *
    * @param value the string to convert
    * @return the value in object form
    */
   Object valueFromString(String value);

   /**
    * Create an empty version of this column. If the column is a scalar,
    * produces a simple copy. If a map, produces a clone without child
    * columns.
    *
    * @return empty clone of this column
    */
   ColumnMetadata cloneEmpty();

   int precision();
   int scale();

   /**
    * Binds this column to the given parent tuple.
    * NOTE(review): the effect of binding is not described in this interface;
    * see the implementing classes.
    *
    * @param parentTuple the tuple metadata to bind this column to
    */
   void bind(TupleMetadata parentTuple);

   /**
    * Creates a copy of this column metadata.
    * NOTE(review): whether child schemas and properties are copied deeply
    * is not specified here -- confirm against implementations.
    *
    * @return the copy
    */
   ColumnMetadata copy();

   /**
    * Converts type metadata into string representation
    * accepted by the table schema parser.
    *
    * @return type metadata string representation
    */
   String typeString();

   /**
    * Converts column metadata into string representation
    * accepted by the table schema parser.
    *
    * @return column metadata string representation
    */
   String columnString();
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.drill.exec.record.metadata;

	import java.time.format.DateTimeFormatter;

	import org.apache.drill.common.types.TypeProtos.DataMode;
	import org.apache.drill.common.types.TypeProtos.MajorType;
	import org.apache.drill.common.types.TypeProtos.MinorType;
	import org.apache.drill.exec.record.MaterializedField;
	import org.apache.drill.exec.vector.accessor.ColumnWriter;

	/**
	* Metadata description of a column including names, types and structure
	* information.
	*/
	public interface ColumnMetadata extends Propertied {

	/**
	* Predicted number of elements per array entry. Default is
	* taken from the often hard-coded value of 10.
	*/
	String EXPECTED_CARDINALITY_PROP = DRILL_PROP_PREFIX + "cardinality";

	/**
	* Default value represented as a string.
	*/
	String DEFAULT_VALUE_PROP = DRILL_PROP_PREFIX + "default";

	/**
	* Expected (average) width for variable-width columns.
	*/
	String EXPECTED_WIDTH_PROP = DRILL_PROP_PREFIX + "width";

	/**
	* Optional format to use when converting to/from string values.
	*/
	String FORMAT_PROP = DRILL_PROP_PREFIX + "format";

	/**
	* Indicates how to handle blanks. Must be one of the valid values defined
	* in AbstractConvertFromString. Normally set on the converter by the plugin
	* rather than by the user in the schema.
	*/
	String BLANK_AS_PROP = DRILL_PROP_PREFIX + "blank-as";

	/**
	* Convert blanks to null values (if the column is nullable), or
	* fill with the default value (non-nullable.)
	*/
	String BLANK_AS_NULL = "null";

	/**
	* Convert blanks for numeric fields to 0. For non-numeric
	* fields, convert to null (for nullable) or the default value
	* (for non-nullable). Works best if non-numeric fields are declared
	* as nullable.
	*/
	String BLANK_AS_ZERO = "0";

	/**
	* Indicates whether to project the column in a wildcard (*) query.
	* Special columns may be excluded from projection. Certain "special"
	* columns may be available only when explicitly requested. For example,
	* the log reader has a "_raw" column which includes the entire input
	* line before parsing. This column can be requested explicitly:<br>
	* {@code SELECT foo, bar, _raw FROM ...}<br>
	* but the column will <i>not</i> be included when using the wildcard:<br>
	* {@code SELECT * FROM ...}
	* <p>
	* Marking a column (either in the provided schema or the reader schema)
	* will prevent that column from appearing in a wildcard expansion.
	*/
	String EXCLUDE_FROM_WILDCARD = DRILL_PROP_PREFIX + "special";

	/**
	* Default array size of 10.
	* NOTE(review): presumably the default expected cardinality for array
	* columns (see {@link #setExpectedElementCount(int)}) -- confirm against
	* the implementations.
	*/
	int DEFAULT_ARRAY_SIZE = 10;

	/**
	* Indicates that a provided schema column is an implicit column
	* (one defined by Drill rather than the reader.) Allows the implicit
	* schema to reify partition names, say, as reader-specific names.
	* For example, {@code dir0} might be reified as {@code year}, etc.
	* <p>
	* Available when the underlying reader supports implicit columns.
	* The value is the defined implicit column name (not the name
	* set via system/session options.) Using the defined name makes
	* the provided schema immune from runtime changes to column names.
	* <p>
	* As the result of adding this feature, any column <i>not</i>
	* tagged as implicit is a reader column, even if that column
	* happens to have the same (currently selected runtime) name
	* as an implicit column.
	*/
	String IMPLICIT_COL_TYPE = DRILL_PROP_PREFIX + "implicit";

	/**
	* Fully-qualified name implicit column type.
	*/
	String IMPLICIT_FQN = "fqn";

	/**
	* File path implicit column type.
	*/
	String IMPLICIT_FILEPATH = "filepath";

	/**
	* File name implicit column type.
	*/
	String IMPLICIT_FILENAME = "filename";

	/**
	* File suffix implicit column type.
	*/
	String IMPLICIT_SUFFIX = "suffix";

	/**
	* Prefix for partition directories. dir0 is the table root
	* folder, dir1 the first subdirectory, and so on. Directories that
	* don't exist in the actual file path take a {@code NULL} value.
	*/
	String IMPLICIT_PARTITION_PREFIX = "dir";

	/**
	* Rough characterization of Drill types into metadata categories.
	* Various aspects of Drill's type system are very, very messy.
	* However, Drill is defined by its code, not some abstract design,
	* so the metadata system here does the best job it can to simplify
	* the messy type system while staying close to the underlying
	* implementation.
	*/
	enum StructureType {

	/**
	* Primitive column (all types except List, Map and Union.)
	* Includes (one-dimensional) arrays of those types.
	*/
	PRIMITIVE,

	/**
	* Map or repeated map. Also describes the row as a whole.
	*/
	TUPLE,

	/**
	* Union or (non-repeated) list. (A non-repeated list is,
	* essentially, a repeated union.)
	*/
	VARIANT,

	/**
	* A repeated list. A repeated list is not simply the repeated
	* form of a list, it is something else entirely. It acts as
	* a dimensional wrapper around any other type (except list)
	* and adds a non-nullable extra dimension. Hence, this type is
	* for 2D+ arrays.
	* <p>
	* In theory, a 2D list of, say, INT would be an INT column, but
	* repeated in two dimensions. Alas, that is not how it is. Also,
	* if we have a separate category for 2D lists, we should have
	* a separate category for 1D lists. But, again, that is not how
	* the code has evolved.
	*/
	MULTI_ARRAY,

	/**
	* Dict or repeated dict.
	*/
	DICT,

	/**
	* Unknown, specified at runtime. (Only for logical columns,
	* not for physical columns.)
	*/
	DYNAMIC
	}

	/**
	* Returns the {@link StructureType} category that describes this column.
	*
	* @return the structure type of this column
	*/
	StructureType structureType();

	/**
	* Schema for {@code TUPLE} columns.
	*
	* @return the tuple schema
	*/
	TupleMetadata tupleSchema();

	/**
	* Schema for {@code VARIANT} columns.
	*
	* @return the variant schema
	*/
	VariantMetadata variantSchema();

	/**
	* Schema of inner dimension for {@code MULTI_ARRAY} columns.
	* If an array is 3D, the outer column represents all 3 dimensions.
	* {@code outer.childSchema()} gives another {@code MULTI_ARRAY}
	* for the inner 2D array.
	* {@code outer.childSchema().childSchema()} gives a column
	* of some other type (but repeated) for the 1D array.
	* <p>
	* Sorry for the mess, but it is how the code works and we are not
	* in a position to revisit data type fundamentals.
	*
	* @return the description of the (n-1)st dimension.
	*/
	ColumnMetadata childSchema();

	// NOTE(review): the accessors below carry no documentation in this
	// interface; their exact semantics are defined by the implementing
	// classes. Confirm there before relying on edge-case behavior.
	MaterializedField schema();
	MaterializedField emptySchema();
	String name();
	MinorType type();
	MajorType majorType();
	DataMode mode();
	int dimensions();
	boolean isNullable();
	boolean isArray();
	boolean isVariableWidth();
	boolean isMap();
	boolean isVariant();
	boolean isDict();
	boolean isScalar();

	/**
	* Reports if the column is dynamic. A dynamic column is one with
	* a "type to be named later." It is valid for describing a dynamic
	* schema, but not for creating vectors; to create a vector the
	* column must be resolved to a concrete type. The context should
	* make it clear if any columns can be dynamic.
	*
	* @return {@code true} if the column does not yet have a concrete
	* type, {@code false} if the column type is concrete
	*/
	boolean isDynamic();

	/**
	* Determine if the schema represents a column with a LIST type with
	* UNION elements. (Lists can be of a single
	* type (with nullable elements) or can be of unions.)
	*
	* @return true if the column is of type LIST of UNIONs
	*/
	boolean isMultiList();

	/**
	* Report whether one column is equivalent to another. Columns are equivalent
	* if they have the same name, type and structure (ignoring internal structure
	* such as properties.)
	*
	* @param other the column to compare against this one
	* @return true if the two columns are equivalent as described above
	*/
	boolean isEquivalent(ColumnMetadata other);

	/**
	* For variable-width columns, specify the expected column width to be used
	* when allocating a new vector. Does nothing for fixed-width columns.
	*
	* @param width the expected column width
	*/
	void setExpectedWidth(int width);

	/**
	* Get the expected width for a column. This is the actual width for fixed-
	* width columns, the specified width (defaulting to 50) for variable-width
	* columns.
	*
	* @return the expected column width of each data value. Does not include
	* "overhead" space such as for the null-value vector or offset vector
	*/
	int expectedWidth();

	/**
	* For an array column, specify the expected average array cardinality.
	* Ignored for non-array columns. Used when allocating new vectors.
	*
	* @param childCount the expected average array cardinality. Defaults to
	* 1 for non-array columns, 10 for array columns
	*/
	void setExpectedElementCount(int childCount);

	/**
	* Returns the expected array cardinality for array columns, or 1 for
	* non-array columns.
	*
	* @return the expected value cardinality per value (per-row for top-level
	* columns, per array element for arrays within lists)
	*/
	int expectedElementCount();

	/**
	* Sets the format for this column.
	* NOTE(review): presumably stored as the {@link #FORMAT_PROP} property
	* and used when converting to/from string values -- confirm.
	*
	* @param value the format string
	*/
	void setFormat(String value);

	/**
	* Returns the format for this column.
	* NOTE(review): presumably the value given to {@link #setFormat(String)};
	* the result when no format was set is not specified here -- confirm.
	*
	* @return the format string
	*/
	String format();

	/**
	* Returns the formatter to use for date/time values. Only valid for
	* date/time columns.
	*
	* @return the formatter to use for date/time values
	*/
	DateTimeFormatter dateTimeFormatter();

	/**
	* Sets the default value property using the string-encoded form of the value.
	* The default value is used for filling a vector when no real data is available.
	*
	* @param value the default value in String representation
	*/
	void setDefaultValue(String value);

	/**
	* Returns the default value for this column in String literal representation.
	*
	* @return the default value in String literal representation, or null if no
	* default value has been set
	*/
	String defaultValue();

	/**
	* Returns the default value decoded into object form. This is the same as:
	* <pre><code>decodeValue(defaultValue());
	* </code></pre>
	* NOTE(review): no {@code decodeValue} method is declared in this
	* interface; {@link #valueFromString(String)} looks like the intended
	* method -- confirm.
	*
	* @return the default value decoded as an object that can be passed to
	* the {@link ColumnWriter#setObject} method.
	*/
	Object decodeDefaultValue();

	/**
	* Converts a value object to a string.
	* NOTE(review): presumably the inverse of {@link #valueFromString(String)},
	* based on this column's type and format -- confirm against implementations.
	*
	* @param value the value to convert
	* @return the string form of the value
	*/
	String valueToString(Object value);

	/**
	* Converts a string to a value object.
	* NOTE(review): presumably the inverse of {@link #valueToString(Object)},
	* based on this column's type and format -- confirm against implementations.
	*
	* @param value the string to convert
	* @return the value in object form
	*/
	Object valueFromString(String value);

	/**
	* Create an empty version of this column. If the column is a scalar,
	* produces a simple copy. If a map, produces a clone without child
	* columns.
	*
	* @return empty clone of this column
	*/
	ColumnMetadata cloneEmpty();

	int precision();
	int scale();

	/**
	* Binds this column to the given parent tuple.
	* NOTE(review): the effect of binding is not described in this interface;
	* see the implementing classes.
	*
	* @param parentTuple the tuple metadata to bind this column to
	*/
	void bind(TupleMetadata parentTuple);

	/**
	* Creates a copy of this column metadata.
	* NOTE(review): whether child schemas and properties are copied deeply
	* is not specified here -- confirm against implementations.
	*
	* @return the copy
	*/
	ColumnMetadata copy();

	/**
	* Converts type metadata into string representation
	* accepted by the table schema parser.
	*
	* @return type metadata string representation
	*/
	String typeString();

	/**
	* Converts column metadata into string representation
	* accepted by the table schema parser.
	*
	* @return column metadata string representation
	*/
	String columnString();
	}