rust/datafusion/src/lib.rs - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 #![warn(missing_docs)]
 // Clippy lints, some should be disabled incrementally
 #![allow(
     clippy::float_cmp,
     clippy::from_over_into,
     clippy::module_inception,
     clippy::new_without_default,
     clippy::type_complexity,
     clippy::upper_case_acronyms
 )]

 //! [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion)
 //! is an extensible query execution framework that uses
 //! [Apache Arrow](https://arrow.apache.org) as its in-memory format.
 //!
 //! DataFusion supports both an SQL and a DataFrame API for building logical query plans
 //! as well as a query optimizer and execution engine capable of parallel execution
 //! against partitioned data sources (CSV and Parquet) using threads.
 //!
 //! Below is an example of how to execute a query against data stored
 //! in a CSV file using a [`DataFrame`](dataframe::DataFrame):
 //!
 //! ```rust
 //! # use datafusion::prelude::*;
 //! # use datafusion::error::Result;
 //! # use arrow::record_batch::RecordBatch;
 //!
 //! # #[tokio::main]
 //! # async fn main() -> Result<()> {
 //! let mut ctx = ExecutionContext::new();
 //!
 //! // create the dataframe
 //! let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
 //!
 //! // create a plan
 //! let df = df.filter(col("a").lt_eq(col("b")))?
 //!            .aggregate(vec![col("a")], vec![min(col("b"))])?
 //!            .limit(100)?;
 //!
 //! // execute the plan
 //! let results: Vec<RecordBatch> = df.collect().await?;
 //!
 //! // format the results
 //! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?;
 //!
 //! let expected = vec![
 //!     "+---+--------+",
 //!     "| a | MIN(b) |",
 //!     "+---+--------+",
 //!     "| 1 | 2      |",
 //!     "+---+--------+"
 //! ];
 //!
 //! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
 //! # Ok(())
 //! # }
 //! ```
 //!
 //! and how to execute a query against a CSV using SQL:
 //!
 //! ```
 //! # use datafusion::prelude::*;
 //! # use datafusion::error::Result;
 //! # use arrow::record_batch::RecordBatch;
 //!
 //! # #[tokio::main]
 //! # async fn main() -> Result<()> {
 //! let mut ctx = ExecutionContext::new();
 //!
 //! ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?;
 //!
 //! // create a plan
 //! let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?;
 //!
 //! // execute the plan
 //! let results: Vec<RecordBatch> = df.collect().await?;
 //!
 //! // format the results
 //! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?;
 //!
 //! let expected = vec![
 //!     "+---+--------+",
 //!     "| a | MIN(b) |",
 //!     "+---+--------+",
 //!     "| 1 | 2      |",
 //!     "+---+--------+"
 //! ];
 //!
 //! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
 //! # Ok(())
 //! # }
 //! ```
 //!
 //! ## Parse, Plan, Optimize, Execute
 //!
 //! DataFusion is a fully fledged query engine capable of performing complex operations.
 //! Specifically, when DataFusion receives an SQL query, there are different steps
 //! that it passes through until a result is obtained. Broadly, they are:
 //!
 //! 1. The string is parsed to an Abstract syntax tree (AST) using [sqlparser](https://docs.rs/sqlparser/0.6.1/sqlparser/).
 //! 2. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical expressions on the AST to logical expressions [`Expr`s](logical_plan::Expr).
 //! 3. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical nodes on the AST to a [`LogicalPlan`](logical_plan::LogicalPlan).
 //! 4. [`OptimizerRules`](optimizer::optimizer::OptimizerRule) are applied to the [`LogicalPlan`](logical_plan::LogicalPlan) to optimize it.
 //! 5. The [`LogicalPlan`](logical_plan::LogicalPlan) is converted to an [`ExecutionPlan`](physical_plan::ExecutionPlan) by a [`PhysicalPlanner`](physical_plan::PhysicalPlanner)
 //! 6. The [`ExecutionPlan`](physical_plan::ExecutionPlan) is executed against data through the [`ExecutionContext`](execution::context::ExecutionContext)
 //!
 //! With a [`DataFrame`](dataframe::DataFrame) API, steps 1-3 are not used as the DataFrame builds the [`LogicalPlan`](logical_plan::LogicalPlan) directly.
 //!
 //! Steps 1-5 are typically cheap when compared to step 6, and thus DataFusion puts a
 //! lot of effort into ensuring that step 6 runs efficiently and without errors.
 //!
 //! DataFusion's planning is divided into two main parts: logical planning and physical planning.
 //!
 //! ### Logical plan
 //!
 //! Logical planning yields [`logical plans`](logical_plan::LogicalPlan) and [`logical expressions`](logical_plan::Expr).
 //! These are [`Schema`](arrow::datatypes::Schema)-aware traits that represent statements whose result is independent of how it should physically be executed.
 //!
 //! A [`LogicalPlan`](logical_plan::LogicalPlan) is a directed acyclic graph (DAG) of other [`LogicalPlan`s](logical_plan::LogicalPlan) and each node contains logical expressions ([`Expr`s](logical_plan::Expr)).
 //! All of these are located in [`logical_plan`](logical_plan).
 //!
 //! ### Physical plan
 //!
 //! A Physical plan ([`ExecutionPlan`](physical_plan::ExecutionPlan)) is a plan that can be executed against data.
 //! In contrast to a logical plan, the physical plan has concrete information about how the calculation
 //! should be performed (e.g. what Rust functions are used) and how data should be loaded into memory.
 //!
 //! [`ExecutionPlan`](physical_plan::ExecutionPlan) uses the Arrow format as its in-memory representation of data, through the [arrow] crate.
 //! We recommend going through [its documentation](arrow) for details on how the data is physically represented.
 //!
 //! An [`ExecutionPlan`](physical_plan::ExecutionPlan) is composed of nodes (each implementing the trait [`ExecutionPlan`](physical_plan::ExecutionPlan)),
 //! and each node is composed of physical expressions ([`PhysicalExpr`](physical_plan::PhysicalExpr))
 //! or aggregate expressions ([`AggregateExpr`](physical_plan::AggregateExpr)).
 //! All of these are located in the module [`physical_plan`](physical_plan).
 //!
 //! Broadly speaking,
 //!
 //! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asynchronously returns
 //!   an iterator over [`RecordBatch`](arrow::record_batch::RecordBatch)
 //!   (a node-specific struct that implements [`RecordBatchReader`](arrow::record_batch::RecordBatchReader))
 //! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow::record_batch::RecordBatch)
 //!   and returns an [`Array`](arrow::array::Array)
 //! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow::record_batch::RecordBatch)
 //!   and returns a [`RecordBatch`](arrow::record_batch::RecordBatch) of a single row(*)
 //!
 //! (*) Technically, it aggregates the results on each partition and then merges the results into a single partition.
 //!
 //! The following physical nodes are currently implemented:
 //!
 //! * Projection: [`ProjectionExec`](physical_plan::projection::ProjectionExec)
 //! * Filter: [`FilterExec`](physical_plan::filter::FilterExec)
 //! * Hash and Grouped aggregations: [`HashAggregateExec`](physical_plan::hash_aggregate::HashAggregateExec)
 //! * Sort: [`SortExec`](physical_plan::sort::SortExec)
 //! * Merge (partitions): [`MergeExec`](physical_plan::merge::MergeExec)
 //! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec)
 //! * Scan a CSV: [`CsvExec`](physical_plan::csv::CsvExec)
 //! * Scan a Parquet: [`ParquetExec`](physical_plan::parquet::ParquetExec)
 //! * Scan from memory: [`MemoryExec`](physical_plan::memory::MemoryExec)
 //! * Explain the plan: [`ExplainExec`](physical_plan::explain::ExplainExec)
 //!
 //! ## Customize
 //!
 //! DataFusion allows users to
 //! * extend the planner to use user-defined logical and physical nodes ([`QueryPlanner`](execution::context::QueryPlanner))
 //! * declare and use user-defined scalar functions ([`ScalarUDF`](physical_plan::udf::ScalarUDF))
 //! * declare and use user-defined aggregate functions ([`AggregateUDF`](physical_plan::udaf::AggregateUDF))
 //!
 //! You can find examples of each of them in the examples section.

 // External crates linked explicitly from the crate root.
 extern crate arrow;
 extern crate sqlparser;

 // Public modules declared by this crate root; each module's own documentation
 // lives alongside its definition.
 pub mod catalog;
 pub mod dataframe;
 pub mod datasource;
 pub mod error;
 pub mod execution;
 pub mod logical_plan;
 pub mod optimizer;
 pub mod physical_optimizer;
 pub mod physical_plan;
 pub mod prelude;
 pub mod scalar;
 pub mod sql;
 pub mod variable;

 // Compiled only when building this crate's own tests.
 #[cfg(test)]
 pub mod test;

 // `lazy_static` (and its macros, via `#[macro_use]`) is only brought in when
 // the `regex_expressions` feature is enabled.
 #[macro_use]
 #[cfg(feature = "regex_expressions")]
 extern crate lazy_static;

 // Only under `cfg(doctest)`: passes `../README.md` to `doc_comment::doctest!`
 // under the name `readme_example_test` -- presumably so the README's code
 // examples are exercised as doctests (confirm against the `doc_comment` docs).
 #[cfg(doctest)]
 doc_comment::doctest!("../README.md", readme_example_test);
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.
	#![warn(missing_docs)]
	// Clippy lints, some should be disabled incrementally
	#![allow(
	clippy::float_cmp,
	clippy::from_over_into,
	clippy::module_inception,
	clippy::new_without_default,
	clippy::type_complexity,
	clippy::upper_case_acronyms
	)]

	//! [DataFusion](https://github.com/apache/arrow/tree/master/rust/datafusion)
	//! is an extensible query execution framework that uses
	//! [Apache Arrow](https://arrow.apache.org) as its in-memory format.
	//!
	//! DataFusion supports both an SQL and a DataFrame API for building logical query plans
	//! as well as a query optimizer and execution engine capable of parallel execution
	//! against partitioned data sources (CSV and Parquet) using threads.
	//!
	//! Below is an example of how to execute a query against data stored
	//! in a CSV file using a [`DataFrame`](dataframe::DataFrame):
	//!
	//! ```rust
	//! # use datafusion::prelude::*;
	//! # use datafusion::error::Result;
	//! # use arrow::record_batch::RecordBatch;
	//!
	//! # #[tokio::main]
	//! # async fn main() -> Result<()> {
	//! let mut ctx = ExecutionContext::new();
	//!
	//! // create the dataframe
	//! let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
	//!
	//! // create a plan
	//! let df = df.filter(col("a").lt_eq(col("b")))?
	//!            .aggregate(vec![col("a")], vec![min(col("b"))])?
	//!            .limit(100)?;
	//!
	//! // execute the plan
	//! let results: Vec<RecordBatch> = df.collect().await?;
	//!
	//! // format the results
	//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?;
	//!
	//! let expected = vec![
	//!     "+---+--------+",
	//!     "| a | MIN(b) |",
	//!     "+---+--------+",
	//!     "| 1 | 2      |",
	//!     "+---+--------+"
	//! ];
	//!
	//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
	//! # Ok(())
	//! # }
	//! ```
	//!
	//! and how to execute a query against a CSV using SQL:
	//!
	//! ```
	//! # use datafusion::prelude::*;
	//! # use datafusion::error::Result;
	//! # use arrow::record_batch::RecordBatch;
	//!
	//! # #[tokio::main]
	//! # async fn main() -> Result<()> {
	//! let mut ctx = ExecutionContext::new();
	//!
	//! ctx.register_csv("example", "tests/example.csv", CsvReadOptions::new())?;
	//!
	//! // create a plan
	//! let df = ctx.sql("SELECT a, MIN(b) FROM example GROUP BY a LIMIT 100")?;
	//!
	//! // execute the plan
	//! let results: Vec<RecordBatch> = df.collect().await?;
	//!
	//! // format the results
	//! let pretty_results = arrow::util::pretty::pretty_format_batches(&results)?;
	//!
	//! let expected = vec![
	//!     "+---+--------+",
	//!     "| a | MIN(b) |",
	//!     "+---+--------+",
	//!     "| 1 | 2      |",
	//!     "+---+--------+"
	//! ];
	//!
	//! assert_eq!(pretty_results.trim().lines().collect::<Vec<_>>(), expected);
	//! # Ok(())
	//! # }
	//! ```
	//!
	//! ## Parse, Plan, Optimize, Execute
	//!
	//! DataFusion is a fully fledged query engine capable of performing complex operations.
	//! Specifically, when DataFusion receives an SQL query, there are different steps
	//! that it passes through until a result is obtained. Broadly, they are:
	//!
	//! 1. The string is parsed to an Abstract syntax tree (AST) using [sqlparser](https://docs.rs/sqlparser/0.6.1/sqlparser/).
	//! 2. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical expressions on the AST to logical expressions [`Expr`s](logical_plan::Expr).
	//! 3. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical nodes on the AST to a [`LogicalPlan`](logical_plan::LogicalPlan).
	//! 4. [`OptimizerRules`](optimizer::optimizer::OptimizerRule) are applied to the [`LogicalPlan`](logical_plan::LogicalPlan) to optimize it.
	//! 5. The [`LogicalPlan`](logical_plan::LogicalPlan) is converted to an [`ExecutionPlan`](physical_plan::ExecutionPlan) by a [`PhysicalPlanner`](physical_plan::PhysicalPlanner)
	//! 6. The [`ExecutionPlan`](physical_plan::ExecutionPlan) is executed against data through the [`ExecutionContext`](execution::context::ExecutionContext)
	//!
	//! With a [`DataFrame`](dataframe::DataFrame) API, steps 1-3 are not used as the DataFrame builds the [`LogicalPlan`](logical_plan::LogicalPlan) directly.
	//!
	//! Steps 1-5 are typically cheap when compared to step 6, and thus DataFusion puts a
	//! lot of effort into ensuring that step 6 runs efficiently and without errors.
	//!
	//! DataFusion's planning is divided into two main parts: logical planning and physical planning.
	//!
	//! ### Logical plan
	//!
	//! Logical planning yields [`logical plans`](logical_plan::LogicalPlan) and [`logical expressions`](logical_plan::Expr).
	//! These are [`Schema`](arrow::datatypes::Schema)-aware traits that represent statements whose result is independent of how it should physically be executed.
	//!
	//! A [`LogicalPlan`](logical_plan::LogicalPlan) is a directed acyclic graph (DAG) of other [`LogicalPlan`s](logical_plan::LogicalPlan) and each node contains logical expressions ([`Expr`s](logical_plan::Expr)).
	//! All of these are located in [`logical_plan`](logical_plan).
	//!
	//! ### Physical plan
	//!
	//! A Physical plan ([`ExecutionPlan`](physical_plan::ExecutionPlan)) is a plan that can be executed against data.
	//! In contrast to a logical plan, the physical plan has concrete information about how the calculation
	//! should be performed (e.g. what Rust functions are used) and how data should be loaded into memory.
	//!
	//! [`ExecutionPlan`](physical_plan::ExecutionPlan) uses the Arrow format as its in-memory representation of data, through the [arrow] crate.
	//! We recommend going through [its documentation](arrow) for details on how the data is physically represented.
	//!
	//! An [`ExecutionPlan`](physical_plan::ExecutionPlan) is composed of nodes (each implementing the trait [`ExecutionPlan`](physical_plan::ExecutionPlan)),
	//! and each node is composed of physical expressions ([`PhysicalExpr`](physical_plan::PhysicalExpr))
	//! or aggregate expressions ([`AggregateExpr`](physical_plan::AggregateExpr)).
	//! All of these are located in the module [`physical_plan`](physical_plan).
	//!
	//! Broadly speaking,
	//!
	//! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asynchronously returns
	//!   an iterator over [`RecordBatch`](arrow::record_batch::RecordBatch)
	//!   (a node-specific struct that implements [`RecordBatchReader`](arrow::record_batch::RecordBatchReader))
	//! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow::record_batch::RecordBatch)
	//!   and returns an [`Array`](arrow::array::Array)
	//! * an [`AggregateExpr`](physical_plan::AggregateExpr) receives [`RecordBatch`es](arrow::record_batch::RecordBatch)
	//!   and returns a [`RecordBatch`](arrow::record_batch::RecordBatch) of a single row(*)
	//!
	//! (*) Technically, it aggregates the results on each partition and then merges the results into a single partition.
	//!
	//! The following physical nodes are currently implemented:
	//!
	//! * Projection: [`ProjectionExec`](physical_plan::projection::ProjectionExec)
	//! * Filter: [`FilterExec`](physical_plan::filter::FilterExec)
	//! * Hash and Grouped aggregations: [`HashAggregateExec`](physical_plan::hash_aggregate::HashAggregateExec)
	//! * Sort: [`SortExec`](physical_plan::sort::SortExec)
	//! * Merge (partitions): [`MergeExec`](physical_plan::merge::MergeExec)
	//! * Limit: [`LocalLimitExec`](physical_plan::limit::LocalLimitExec) and [`GlobalLimitExec`](physical_plan::limit::GlobalLimitExec)
	//! * Scan a CSV: [`CsvExec`](physical_plan::csv::CsvExec)
	//! * Scan a Parquet: [`ParquetExec`](physical_plan::parquet::ParquetExec)
	//! * Scan from memory: [`MemoryExec`](physical_plan::memory::MemoryExec)
	//! * Explain the plan: [`ExplainExec`](physical_plan::explain::ExplainExec)
	//!
	//! ## Customize
	//!
	//! DataFusion allows users to
	//! * extend the planner to use user-defined logical and physical nodes ([`QueryPlanner`](execution::context::QueryPlanner))
	//! * declare and use user-defined scalar functions ([`ScalarUDF`](physical_plan::udf::ScalarUDF))
	//! * declare and use user-defined aggregate functions ([`AggregateUDF`](physical_plan::udaf::AggregateUDF))
	//!
	//! You can find examples of each of them in the examples section.

	// External crates linked explicitly from the crate root.
	extern crate arrow;
	extern crate sqlparser;

	// Public modules declared by this crate root; each module's own documentation
	// lives alongside its definition.
	pub mod catalog;
	pub mod dataframe;
	pub mod datasource;
	pub mod error;
	pub mod execution;
	pub mod logical_plan;
	pub mod optimizer;
	pub mod physical_optimizer;
	pub mod physical_plan;
	pub mod prelude;
	pub mod scalar;
	pub mod sql;
	pub mod variable;

	// Compiled only when building this crate's own tests.
	#[cfg(test)]
	pub mod test;

	// `lazy_static` (and its macros, via `#[macro_use]`) is only brought in when
	// the `regex_expressions` feature is enabled.
	#[macro_use]
	#[cfg(feature = "regex_expressions")]
	extern crate lazy_static;

	// Only under `cfg(doctest)`: passes `../README.md` to `doc_comment::doctest!`
	// under the name `readme_example_test` -- presumably so the README's code
	// examples are exercised as doctests (confirm against the `doc_comment` docs).
	#[cfg(doctest)]
	doc_comment::doctest!("../README.md", readme_example_test);