/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.physical.impl.scan.framework;
import org.apache.drill.common.exceptions.CustomErrorContext;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
import org.apache.drill.exec.physical.resultSet.RowSetLoader;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.server.options.OptionSet;
import com.typesafe.config.Config;
/**
* Negotiates the table schema with the scanner framework and provides
* context information for the reader. Scans use either a "dynamic" or
* a defined schema.
* <p>
* Regardless of the schema type, the result of building the schema is a
* result set loader used to prepare batches for use in the query. The reader
* can simply read all columns, allowing the framework to discard unwanted
* values. Or, for efficiency, the reader can check the column metadata
* to determine whether a column is projected and, if not, skip reading
* the column from the input source.
*
* <h4>Defined Schema</h4>
*
* If defined, the execution plan provides the output schema (presumably
* computed from an accurate metadata source). The reader must populate
* the prescribed columns, performing column type conversions as needed.
* The reader can determine if the schema is defined by calling
* {@link #hasProvidedSchema()}.
* <p>
* At present, the scan framework filters the "provided schema" against
* the project list so that this class presents only the actual output
* schema. Future versions may do the filtering in the planner, but
* the result for readers will be the same either way.
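* <p>
* A minimal sketch of the provided-schema path (the reader object and
* its column-request method are hypothetical, not part of this API):
* <pre>{@code
* if (negotiator.hasProvidedSchema()) {
*   TupleMetadata provided = negotiator.providedSchema();
*   for (ColumnMetadata col : provided) {
*     // Ask the (hypothetical) underlying reader to emit this column,
*     // converting the source type to the provided Drill type.
*     myReader.requestColumn(col.name(), col.type());
*   }
* }
* }</pre>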
*
* <h4>Dynamic Schema</h4>
*
* A dynamic schema occurs when the plan does not specify a schema.
* Drill is unique in its support for "schema on read" in the sense
* that Drill does not know the schema until the reader defines it at
* scan time.
* <p>
* The reader and scan framework coordinate to form the output schema.
* The reader offers the columns it has available. The scan framework
* uses the projection list to decide which to accept. Either way, the
* scan framework provides a column writer for each column (returning a
* do-nothing "dummy" writer if the column is unprojected).
* <p>
* With a dynamic schema, the reader provides the table schema in one
* of two ways: early schema or late schema. Either way, the project
* list from the physical plan determines which table columns are
* materialized and which are not. Column writers are provided for all
* table columns so that readers which must read sequentially can write
* every column, but only the materialized columns are saved to value
* vectors.
*
* <h4>Early Dynamic Schema</h4>
*
* Some readers can determine the source schema at the start of a scan.
* For example, a CSV file has headers, a Parquet file has footers, both
* of which define a schema. This case is called "early schema." The
* reader defines the schema by calling
* {@link #tableSchema(TupleMetadata, boolean)} to provide the known schema.
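* <p>
* For example, an early-schema reader that discovers its columns at
* open time might declare them as follows (a sketch using Drill's
* {@code SchemaBuilder} convenience class; column names are
* illustrative):
* <pre>{@code
* TupleMetadata schema = new SchemaBuilder()
*   .add("id", MinorType.INT)                // required column
*   .addNullable("name", MinorType.VARCHAR)  // nullable column
*   .buildSchema();
* negotiator.tableSchema(schema, true);      // schema is complete
* }</pre>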
*
* <h4>Late Dynamic Schema</h4>
*
* Other readers don't know the input schema until the reader actually
* reads the data. For example, JSON typically has no schema, but does
* have sufficient structure (name/value pairs) to infer one.
* <p>
* The late schema reader calls {@link RowSetLoader#addColumn} to
* add each column as it is discovered during the scan.
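* <p>
* A sketch of late-schema discovery (assumes a nullable VARCHAR
* column; {@code MetadataUtils} is one way to build the column
* metadata):
* <pre>{@code
* RowSetLoader writer = loader.writer();
* // A new name/value pair appeared in the input: add its column.
* int index = writer.addColumn(MetadataUtils.newScalar(
*     "comment", MinorType.VARCHAR, DataMode.OPTIONAL));
* writer.scalar(index).setString(value);
* }</pre>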
* <p>
* Note that, to avoid schema conflicts, a late schema reader
* <i><b>must</b></i> define the full set of columns in the first batch,
* and must stick to that schema for all subsequent batches. This allows
* the scan operator to look one batch ahead to learn the columns.
* <p>
* Drill, however, cannot predict the future. Without a defined schema,
* downstream operators cannot know which columns might appear later
* in the scan, with which types. Today this is a strong guideline.
* Future versions may enforce this rule.
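* <p>
* Putting it together, a typical early-schema reader's {@code open()}
* method might look like this sketch ({@code mySchema} and the stored
* {@code loader} field are illustrative):
* <pre>{@code
* public boolean open(SchemaNegotiator negotiator) {
*   negotiator.tableSchema(mySchema, true);  // declare the known schema
*   loader = negotiator.build();             // create the result set loader
*   return true;                             // the reader has data
* }
* }</pre>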
*/
public interface SchemaNegotiator {
OperatorContext context();
Config drillConfig();
OptionSet queryOptions();
/**
* Specify an advanced error context which allows the reader to
* fill in custom context values.
*/
void setErrorContext(CustomErrorContext context);
/**
* Name of the user running the query.
*/
String userName();
/**
* Report if the execution plan defines a provided schema. If so,
* the reader should use that schema, converting or ignoring columns
* as needed. A scan without a provided schema has a "dynamic" schema
* to be defined by the scan operator itself along with the column
* projection list.
*
* @return {@code true} if the execution plan defines the output
* schema, {@code false} if the schema should be computed dynamically
* from the source schema and column projections
*/
boolean hasProvidedSchema();
/**
* Returns the provided schema, if defined. The provided schema is a
* description of the source schema viewed as a Drill schema.
*
* @return the output schema, if {@link #hasProvidedSchema()} returns
* {@code true}, {@code null} otherwise
*/
TupleMetadata providedSchema();
/**
* Specify the table schema if this is an early-schema reader. Need
* not be called for late-schema readers. The schema provided here,
* if any, is a base schema: the reader is free to discover additional
* columns during the read.
* <p>
* Should only be called if the schema is dynamic, that is, if
* {@link #hasProvidedSchema()} returns false.
*
* @param schema the table schema if known at open time
* @param isComplete true if the schema is complete: if it can be used
* to define an empty schema-only batch for the first reader. Set to
* false if the schema is partial: if the reader must read rows to
* determine the full schema
*/
void tableSchema(TupleMetadata schema, boolean isComplete);
/**
* Set the preferred batch size (which may be overridden by the
* result set loader in order to limit vector or batch size).
*
* @param maxRecordsPerBatch preferred number of record per batch
*/
void batchSize(int maxRecordsPerBatch);
/**
* Build the schema, plan the required projections and static
* columns and return a loader used to populate value vectors.
* If the select list includes a subset of table columns, then
* the loader will be set up in table schema order, but the unneeded
* columns are backed by do-nothing "dummy" writers: the batch reader
* may either write those values (which are discarded) or skip them.
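* <p>
* A sketch of the typical write loop once the loader is built (batch
* boundaries are managed by the scan framework; the row source is
* illustrative):
* <pre>{@code
* ResultSetLoader loader = negotiator.build();
* RowSetLoader writer = loader.writer();
* while (!writer.isFull() && hasMoreRows()) {
*   writer.start();                        // begin a row
*   writer.scalar("id").setInt(nextId());  // write projected columns
*   writer.save();                         // commit the row
* }
* }</pre>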
*
* @return the loader for the table with columns arranged in table
* schema order
*/
ResultSetLoader build();
/**
* Report whether the projection list is empty, as occurs in two
* cases:
* <ul>
* <li><tt>SELECT COUNT(*) ...</tt> -- empty project.</li>
* <li><tt>SELECT a, b FROM table(c d)</tt> -- disjoint project.</li>
* </ul>
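* <p>
* A sketch of the fast path this enables (assumes the reader can
* count input rows without decoding them):
* <pre>{@code
* if (negotiator.isProjectionEmpty()) {
*   loader.skipRows(rowCountInBatch);  // report count, write nothing
* }
* }</pre>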
* @return true if no columns are projected, and the client can
* make use of {@link ResultSetLoader#skipRows(int)} to indicate the
* row count, false if at least one column is projected and so
* data must be written using the loader
*/
boolean isProjectionEmpty();
/**
* The context to use as a parent when creating a custom context.
* <p>
* Obtain the error context for this reader from the
* {@link ResultSetLoader}.
*/
CustomErrorContext parentErrorContext();
}