blob: 4b8bf0f4c0b2ad197eb8da46701550bb8a808d46 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! SQL Parser
//!
//! Note that most SQL parsing is now delegated to the sqlparser crate, which handles ANSI
//! SQL, while this module contains the DataFusion-specific SQL extensions.
use sqlparser::dialect::*;
use sqlparser::sqlast::*;
use sqlparser::sqlparser::*;
use sqlparser::sqltokenizer::*;
/// Convenience macro: wrap a message into `Err(ParserError::ParserError(..))`.
macro_rules! parser_err {
    ($msg:expr) => {
        Err(ParserError::ParserError($msg.to_string()))
    };
}
/// Types of files to parse as DataFrames
///
/// Fieldless enum, so it is cheap to copy and compare; the extra derives
/// (`Copy`, `PartialEq`, `Eq`) are backward-compatible additions that let
/// callers match/compare file types directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileType {
    /// Newline-delimited JSON
    NdJson,
    /// Apache Parquet columnar storage
    Parquet,
    /// Comma separated values
    CSV,
}
/// DataFrame AST Node representations.
///
/// Tokens parsed by `DFParser` are converted into these values.
#[derive(Debug, Clone)]
pub enum DFASTNode {
    /// ANSI SQL AST node, produced by delegating to the underlying
    /// `sqlparser` crate parser
    ANSI(ASTNode),
    /// DDL for creating an external table in DataFusion
    /// (`CREATE EXTERNAL TABLE ... STORED AS ... LOCATION ...`)
    CreateExternalTable {
        /// Table name
        name: String,
        /// Optional schema; empty when the statement had no column list
        columns: Vec<SQLColumnDef>,
        /// File type (Parquet, NDJSON, CSV)
        file_type: FileType,
        /// Header row? Only parsed for CSV (`WITH`/`WITHOUT HEADER ROW`);
        /// defaults to `true` for the other file types
        header_row: bool,
        /// Path to file
        location: String,
    },
}
/// SQL Parser
///
/// Wraps the `sqlparser` crate's `Parser`, adding support for the
/// DataFusion-specific `CREATE EXTERNAL TABLE` statement.
pub struct DFParser {
    // Underlying ANSI SQL parser, positioned over the tokenized input
    parser: Parser,
}
impl DFParser {
/// Parse the specified tokens
pub fn new(sql: String) -> Result<Self, ParserError> {
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize()?;
Ok(DFParser {
parser: Parser::new(tokens),
})
}
/// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
pub fn parse_sql(sql: String) -> Result<DFASTNode, ParserError> {
let mut parser = DFParser::new(sql)?;
parser.parse()
}
/// Parse a new expression
pub fn parse(&mut self) -> Result<DFASTNode, ParserError> {
self.parse_expr(0)
}
/// Parse tokens until the precedence changes
fn parse_expr(&mut self, precedence: u8) -> Result<DFASTNode, ParserError> {
let mut expr = self.parse_prefix()?;
loop {
let next_precedence = self.parser.get_next_precedence()?;
if precedence >= next_precedence {
break;
}
if let Some(infix_expr) = self.parse_infix(expr.clone(), next_precedence)? {
expr = infix_expr;
}
}
Ok(expr)
}
/// Parse an expression prefix
fn parse_prefix(&mut self) -> Result<DFASTNode, ParserError> {
if self
.parser
.parse_keywords(vec!["CREATE", "EXTERNAL", "TABLE"])
{
match self.parser.next_token() {
Some(Token::Identifier(id)) => {
// parse optional column list (schema)
let mut columns = vec![];
if self.parser.consume_token(&Token::LParen) {
loop {
if let Some(Token::Identifier(column_name)) =
self.parser.next_token()
{
if let Ok(data_type) = self.parser.parse_data_type() {
let allow_null = if self
.parser
.parse_keywords(vec!["NOT", "NULL"])
{
false
} else if self.parser.parse_keyword("NULL") {
true
} else {
true
};
columns.push(SQLColumnDef {
name: column_name,
data_type: data_type,
allow_null,
default: None,
is_primary: false,
is_unique: false,
});
match self.parser.next_token() {
Some(Token::Comma) => continue,
Some(Token::RParen) => break,
_ => {
return parser_err!(
"Expected ',' or ')' after column definition"
);
}
}
} else {
return parser_err!(
"Error parsing data type in column definition"
);
}
} else {
return parser_err!("Error parsing column name");
}
}
}
let mut headers = true;
let file_type: FileType = if self
.parser
.parse_keywords(vec!["STORED", "AS", "CSV"])
{
if self.parser.parse_keywords(vec!["WITH", "HEADER", "ROW"]) {
headers = true;
} else if self
.parser
.parse_keywords(vec!["WITHOUT", "HEADER", "ROW"])
{
headers = false;
}
FileType::CSV
} else if self.parser.parse_keywords(vec!["STORED", "AS", "NDJSON"]) {
FileType::NdJson
} else if self.parser.parse_keywords(vec!["STORED", "AS", "PARQUET"])
{
FileType::Parquet
} else {
return parser_err!(format!(
"Expected 'STORED AS' clause, found {:?}",
self.parser.peek_token()
));
};
let location: String = if self.parser.parse_keywords(vec!["LOCATION"])
{
self.parser.parse_literal_string()?
} else {
return parser_err!("Missing 'LOCATION' clause");
};
Ok(DFASTNode::CreateExternalTable {
name: id,
columns,
file_type,
header_row: headers,
location,
})
}
_ => parser_err!(format!(
"Unexpected token after CREATE EXTERNAL TABLE: {:?}",
self.parser.peek_token()
)),
}
} else {
Ok(DFASTNode::ANSI(self.parser.parse_prefix()?))
}
}
/// Parse an infix operator
pub fn parse_infix(
&mut self,
_expr: DFASTNode,
_precedence: u8,
) -> Result<Option<DFASTNode>, ParserError> {
unimplemented!()
}
}