blob: 955dbb86602a35a25e17b03e95a2eeeb29add79d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::collections::BTreeSet;
use std::ops::ControlFlow;
use datafusion_common::{DataFusionError, Result};
use crate::TableReference;
use crate::parser::{CopyToSource, CopyToStatement, Statement as DFStatement};
use crate::planner::object_name_to_table_reference;
use sqlparser::ast::*;
// following constants are used in `resolve_table_references`
// and should be same as `datafusion/catalog/src/information_schema.rs`
const INFORMATION_SCHEMA: &str = "information_schema";
const TABLES: &str = "tables";
const VIEWS: &str = "views";
const COLUMNS: &str = "columns";
const DF_SETTINGS: &str = "df_settings";
const SCHEMATA: &str = "schemata";
const ROUTINES: &str = "routines";
const PARAMETERS: &str = "parameters";
/// All information schema tables
const INFORMATION_SCHEMA_TABLES: &[&str] = &[
TABLES,
VIEWS,
COLUMNS,
DF_SETTINGS,
SCHEMATA,
ROUTINES,
PARAMETERS,
];
// Collect table/CTE references as `TableReference`s and normalize them during traversal.
// This avoids a second normalization/conversion pass after visiting the AST.
struct RelationVisitor {
relations: BTreeSet<TableReference>,
all_ctes: BTreeSet<TableReference>,
ctes_in_scope: Vec<TableReference>,
enable_ident_normalization: bool,
}
impl RelationVisitor {
/// Record the reference to `relation`, if it's not a CTE reference.
fn insert_relation(&mut self, relation: &ObjectName) -> ControlFlow<DataFusionError> {
match object_name_to_table_reference(
relation.clone(),
self.enable_ident_normalization,
) {
Ok(relation) => {
if !self.relations.contains(&relation)
&& !self.ctes_in_scope.contains(&relation)
{
self.relations.insert(relation);
}
ControlFlow::Continue(())
}
Err(e) => ControlFlow::Break(e),
}
}
}
impl Visitor for RelationVisitor {
type Break = DataFusionError;
fn pre_visit_relation(&mut self, relation: &ObjectName) -> ControlFlow<Self::Break> {
self.insert_relation(relation)
}
fn pre_visit_query(&mut self, q: &Query) -> ControlFlow<Self::Break> {
if let Some(with) = &q.with {
for cte in &with.cte_tables {
// The non-recursive CTE name is not in scope when evaluating the CTE itself, so this is valid:
// `WITH t AS (SELECT * FROM t) SELECT * FROM t`
// Where the first `t` refers to a predefined table. So we are careful here
// to visit the CTE first, before putting it in scope.
if !with.recursive {
// This is a bit hackish as the CTE will be visited again as part of visiting `q`,
// but thankfully `insert_relation` is idempotent.
cte.visit(self)?;
}
let cte_name = ObjectName::from(vec![cte.alias.name.clone()]);
match object_name_to_table_reference(
cte_name,
self.enable_ident_normalization,
) {
Ok(cte_ref) => self.ctes_in_scope.push(cte_ref),
Err(e) => return ControlFlow::Break(e),
}
}
}
ControlFlow::Continue(())
}
fn post_visit_query(&mut self, q: &Query) -> ControlFlow<Self::Break> {
if let Some(with) = &q.with {
for _ in &with.cte_tables {
// Unwrap: We just pushed these in `pre_visit_query`
self.all_ctes.insert(self.ctes_in_scope.pop().unwrap());
}
}
ControlFlow::Continue(())
}
fn pre_visit_statement(&mut self, statement: &Statement) -> ControlFlow<Self::Break> {
if let Statement::ShowCreate {
obj_type: ShowCreateObject::Table | ShowCreateObject::View,
obj_name,
} = statement
{
self.insert_relation(obj_name)?;
}
// SHOW statements will later be rewritten into a SELECT from the information_schema
let requires_information_schema = matches!(
statement,
Statement::ShowFunctions { .. }
| Statement::ShowVariable { .. }
| Statement::ShowStatus { .. }
| Statement::ShowVariables { .. }
| Statement::ShowCreate { .. }
| Statement::ShowColumns { .. }
| Statement::ShowTables { .. }
| Statement::ShowCollation { .. }
);
if requires_information_schema {
for s in INFORMATION_SCHEMA_TABLES {
// Information schema references are synthesized here, so convert directly.
let obj = ObjectName::from(vec![
Ident::new(INFORMATION_SCHEMA),
Ident::new(*s),
]);
match object_name_to_table_reference(obj, self.enable_ident_normalization)
{
Ok(tbl_ref) => {
self.relations.insert(tbl_ref);
}
Err(e) => return ControlFlow::Break(e),
}
}
}
ControlFlow::Continue(())
}
}
fn control_flow_to_result(flow: ControlFlow<DataFusionError>) -> Result<()> {
match flow {
ControlFlow::Continue(()) => Ok(()),
ControlFlow::Break(err) => Err(err),
}
}
fn visit_statement(statement: &DFStatement, visitor: &mut RelationVisitor) -> Result<()> {
match statement {
DFStatement::Statement(s) => {
control_flow_to_result(s.as_ref().visit(visitor))?;
}
DFStatement::CreateExternalTable(table) => {
control_flow_to_result(visitor.insert_relation(&table.name))?;
}
DFStatement::CopyTo(CopyToStatement { source, .. }) => match source {
CopyToSource::Relation(table_name) => {
control_flow_to_result(visitor.insert_relation(table_name))?;
}
CopyToSource::Query(query) => {
control_flow_to_result(query.visit(visitor))?;
}
},
DFStatement::Explain(explain) => {
visit_statement(&explain.statement, visitor)?;
}
DFStatement::Reset(_) => {}
}
Ok(())
}
/// Collects all tables and views referenced in the SQL statement. CTEs are collected separately.
/// This can be used to determine which tables need to be in the catalog for a query to be planned.
///
/// # Returns
///
/// A `(table_refs, ctes)` tuple, the first element contains table and view references and the second
/// element contains any CTE aliases that were defined and possibly referenced.
///
/// ## Example
///
/// ```
/// # use datafusion_sql::parser::DFParser;
/// # use datafusion_sql::resolve::resolve_table_references;
/// let query = "SELECT a FROM foo where x IN (SELECT y FROM bar)";
/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
/// assert_eq!(table_refs.len(), 2);
/// assert_eq!(table_refs[0].to_string(), "bar");
/// assert_eq!(table_refs[1].to_string(), "foo");
/// assert_eq!(ctes.len(), 0);
/// ```
///
/// ## Example with CTEs
///
/// ```
/// # use datafusion_sql::parser::DFParser;
/// # use datafusion_sql::resolve::resolve_table_references;
/// let query = "with my_cte as (values (1), (2)) SELECT * from my_cte;";
/// let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
/// let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
/// assert_eq!(table_refs.len(), 0);
/// assert_eq!(ctes.len(), 1);
/// assert_eq!(ctes[0].to_string(), "my_cte");
/// ```
pub fn resolve_table_references(
statement: &crate::parser::Statement,
enable_ident_normalization: bool,
) -> Result<(Vec<TableReference>, Vec<TableReference>)> {
let mut visitor = RelationVisitor {
relations: BTreeSet::new(),
all_ctes: BTreeSet::new(),
ctes_in_scope: vec![],
enable_ident_normalization,
};
visit_statement(statement, &mut visitor)?;
Ok((
visitor.relations.into_iter().collect(),
visitor.all_ctes.into_iter().collect(),
))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn resolve_table_references_shadowed_cte() {
use crate::parser::DFParser;
// An interesting edge case where the `t` name is used both as an ordinary table reference
// and as a CTE reference.
let query = "WITH t AS (SELECT * FROM t) SELECT * FROM t";
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
assert_eq!(table_refs.len(), 1);
assert_eq!(ctes.len(), 1);
assert_eq!(ctes[0].to_string(), "t");
assert_eq!(table_refs[0].to_string(), "t");
// UNION is a special case where the CTE is not in scope for the second branch.
let query = "(with t as (select 1) select * from t) union (select * from t)";
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
assert_eq!(table_refs.len(), 1);
assert_eq!(ctes.len(), 1);
assert_eq!(ctes[0].to_string(), "t");
assert_eq!(table_refs[0].to_string(), "t");
// Nested CTEs are also handled.
// Here the first `u` is a CTE, but the second `u` is a table reference.
// While `t` is always a CTE.
let query = "(with t as (with u as (select 1) select * from u) select * from u cross join t)";
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
assert_eq!(table_refs.len(), 1);
assert_eq!(ctes.len(), 2);
assert_eq!(ctes[0].to_string(), "t");
assert_eq!(ctes[1].to_string(), "u");
assert_eq!(table_refs[0].to_string(), "u");
}
#[test]
fn resolve_table_references_recursive_cte() {
use crate::parser::DFParser;
let query = "
WITH RECURSIVE nodes AS (
SELECT 1 as id
UNION ALL
SELECT id + 1 as id
FROM nodes
WHERE id < 10
)
SELECT * FROM nodes
";
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
assert_eq!(table_refs.len(), 0);
assert_eq!(ctes.len(), 1);
assert_eq!(ctes[0].to_string(), "nodes");
}
#[test]
fn resolve_table_references_cte_with_quoted_reference() {
use crate::parser::DFParser;
let query = r#"with barbaz as (select 1) select * from "barbaz""#;
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
assert_eq!(ctes.len(), 1);
assert_eq!(ctes[0].to_string(), "barbaz");
// Quoted reference should still resolve to the CTE when normalization is on
assert_eq!(table_refs.len(), 0);
}
#[test]
fn resolve_table_references_cte_with_quoted_reference_normalization_off() {
use crate::parser::DFParser;
let query = r#"with barbaz as (select 1) select * from "barbaz""#;
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap();
assert_eq!(ctes.len(), 1);
assert_eq!(ctes[0].to_string(), "barbaz");
// Even with normalization off, quoted reference matches same-case CTE name
assert_eq!(table_refs.len(), 0);
}
#[test]
fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_on() {
use crate::parser::DFParser;
let query = r#"with FOObar as (select 1) select * from "FOObar""#;
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, true).unwrap();
// CTE name is normalized to lowercase, quoted reference preserves case, so they differ
assert_eq!(ctes.len(), 1);
assert_eq!(ctes[0].to_string(), "foobar");
assert_eq!(table_refs.len(), 1);
assert_eq!(table_refs[0].to_string(), "FOObar");
}
#[test]
fn resolve_table_references_cte_with_quoted_reference_uppercase_normalization_off() {
use crate::parser::DFParser;
let query = r#"with FOObar as (select 1) select * from "FOObar""#;
let statement = DFParser::parse_sql(query).unwrap().pop_back().unwrap();
let (table_refs, ctes) = resolve_table_references(&statement, false).unwrap();
// Without normalization, cases match exactly, so quoted reference resolves to the CTE
assert_eq!(ctes.len(), 1);
assert_eq!(ctes[0].to_string(), "FOObar");
assert_eq!(table_refs.len(), 0);
}
}