// Copyright 2018 Grove Enterprises LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
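//!
//! A minimal usage sketch, mirroring the unit tests at the bottom of this
//! file (the `sqlparser` crate-root import paths are assumed, so the example
//! is not compiled as a doctest):
//!
//! ```ignore
//! use sqlparser::dialect::GenericSqlDialect;
//! use sqlparser::sqltokenizer::{Token, Tokenizer, Whitespace};
//!
//! let dialect = GenericSqlDialect {};
//! let mut tokenizer = Tokenizer::new(&dialect, "SELECT 1");
//! let tokens = tokenizer.tokenize().unwrap();
//! assert_eq!(
//!     tokens,
//!     vec![
//!         Token::Keyword("SELECT".to_string()),
//!         Token::Whitespace(Whitespace::Space),
//!         Token::Number("1".to_string()),
//!     ]
//! );
//! ```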

use std::iter::Peekable;
use std::str::Chars;

use super::dialect::Dialect;

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// SQL identifier, e.g. a table or column name
    Identifier(String),
    /// SQL keyword, e.g. Keyword("SELECT")
    Keyword(String),
    /// Numeric literal
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string, e.g. 'string'
    SingleQuotedString(String),
    /// Double quoted string, e.g. "string"
    DoubleQuotedString(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `!=` or `<>`
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mult,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// SemiColon `;` used as a separator between a COPY statement and its payload
    SemiColon,
    /// Backslash `\`, used to terminate the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
}

impl ToString for Token {
    fn to_string(&self) -> String {
        match self {
            Token::Identifier(ref id) => id.to_string(),
            Token::Keyword(ref k) => k.to_string(),
            Token::Number(ref n) => n.to_string(),
            Token::Char(ref c) => c.to_string(),
            Token::SingleQuotedString(ref s) => format!("'{}'", s),
            Token::DoubleQuotedString(ref s) => format!("\"{}\"", s),
            Token::Comma => ",".to_string(),
            Token::Whitespace(ws) => ws.to_string(),
            Token::Eq => "=".to_string(),
            Token::Neq => "!=".to_string(),
            Token::Lt => "<".to_string(),
            Token::Gt => ">".to_string(),
            Token::LtEq => "<=".to_string(),
            Token::GtEq => ">=".to_string(),
            Token::Plus => "+".to_string(),
            Token::Minus => "-".to_string(),
            Token::Mult => "*".to_string(),
            Token::Div => "/".to_string(),
            Token::Mod => "%".to_string(),
            Token::LParen => "(".to_string(),
            Token::RParen => ")".to_string(),
            Token::Period => ".".to_string(),
            Token::Colon => ":".to_string(),
            Token::DoubleColon => "::".to_string(),
            Token::SemiColon => ";".to_string(),
            Token::Backslash => "\\".to_string(),
            Token::LBracket => "[".to_string(),
            Token::RBracket => "]".to_string(),
            Token::Ampersand => "&".to_string(),
            Token::LBrace => "{".to_string(),
            Token::RBrace => "}".to_string(),
        }
    }
}

#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
}

impl ToString for Whitespace {
    fn to_string(&self) -> String {
        match self {
            Whitespace::Space => " ".to_string(),
            Whitespace::Newline => "\n".to_string(),
            Whitespace::Tab => "\t".to_string(),
        }
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    pub query: String,
    pub line: u64,
    pub col: u64,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    pub fn new(dialect: &'a dyn Dialect, query: &str) -> Self {
        Self {
            dialect,
            query: query.to_string(),
            line: 1,
            col: 1,
        }
    }

    fn is_keyword(&self, s: &str) -> bool {
        //TODO: need to reintroduce FnvHashSet at some point .. iterating over keywords is
        // not fast but I want the simplicity for now while I experiment with pluggable
        // dialects
        self.dialect.keywords().contains(&s)
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut peekable = self.query.chars().peekable();

        let mut tokens: Vec<Token> = vec![];

        while let Some(token) = self.next_token(&mut peekable)? {
            // Track the current line and column so that errors can report a
            // source position
            match &token {
                Token::Whitespace(Whitespace::Newline) => {
                    self.line += 1;
                    self.col = 1;
                }
                // a tab is counted as a fixed width of four columns
                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::Identifier(s) => self.col += s.len() as u64,
                Token::Keyword(s) => self.col += s.len() as u64,
                Token::Number(s) => self.col += s.len() as u64,
                Token::SingleQuotedString(s) => self.col += s.len() as u64,
                Token::DoubleQuotedString(s) => self.col += s.len() as u64,
                _ => self.col += 1,
            }

            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Get the next token or return None
    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<Token>, TokenizerError> {
        //println!("next_token: {:?}", chars.peek());
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => {
                    chars.next();
                    Ok(Some(Token::Whitespace(Whitespace::Space)))
                }
                '\t' => {
                    chars.next();
                    Ok(Some(Token::Whitespace(Whitespace::Tab)))
                }
                '\n' => {
                    chars.next();
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    let mut s = String::new();
                    chars.next(); // consume
                    s.push(ch);
                    while let Some(&ch) = chars.peek() {
                        if self.dialect.is_identifier_part(ch) {
                            chars.next(); // consume
                            s.push(ch);
                        } else {
                            break;
                        }
                    }
                    // keywords are matched case-insensitively and normalized
                    // to upper case
                    let upper_str = s.to_uppercase();
                    if self.is_keyword(upper_str.as_str()) {
                        Ok(Some(Token::Keyword(upper_str)))
                    } else {
                        Ok(Some(Token::Identifier(s)))
                    }
                }
                // single-quoted string
                '\'' => {
                    //TODO: handle escaped quotes in string
                    //TODO: handle EOF before terminating quote
                    let mut s = String::new();
                    chars.next(); // consume the opening quote
                    while let Some(&ch) = chars.peek() {
                        match ch {
                            '\'' => {
                                chars.next(); // consume the closing quote
                                break;
                            }
                            _ => {
                                chars.next(); // consume
                                s.push(ch);
                            }
                        }
                    }
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double-quoted string
                '"' => {
                    let mut s = String::new();
                    chars.next(); // consume the opening quote
                    while let Some(&ch) = chars.peek() {
                        match ch {
                            '"' => {
                                chars.next(); // consume the closing quote
                                break;
                            }
                            _ => {
                                chars.next(); // consume
                                s.push(ch);
                            }
                        }
                    }
                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // numbers (note: any run of digits and periods is accepted
                // here, so malformed literals such as `1.2.3` are not rejected)
                '0'..='9' => {
                    let mut s = String::new();
                    while let Some(&ch) = chars.peek() {
                        match ch {
                            '0'..='9' | '.' => {
                                chars.next(); // consume
                                s.push(ch);
                            }
                            _ => break,
                        }
                    }
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '+' => self.consume_and_return(chars, Token::Plus),
                '-' => self.consume_and_return(chars, Token::Minus),
                '*' => self.consume_and_return(chars, Token::Mult),
                '/' => self.consume_and_return(chars, Token::Div),
                '%' => self.consume_and_return(chars, Token::Mod),
                '=' => self.consume_and_return(chars, Token::Eq),
                '.' => self.consume_and_return(chars, Token::Period),
                // `!` is only valid as part of the `!=` operator
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '=' => self.consume_and_return(chars, Token::Neq),
                            _ => Err(TokenizerError(format!(
                                "Tokenizer Error at Line: {}, Col: {}",
                                self.line, self.col
                            ))),
                        },
                        None => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}",
                            self.line, self.col
                        ))),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '=' => self.consume_and_return(chars, Token::LtEq),
                            '>' => self.consume_and_return(chars, Token::Neq),
                            _ => Ok(Some(Token::Lt)),
                        },
                        None => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '=' => self.consume_and_return(chars, Token::GtEq),
                            _ => Ok(Some(Token::Gt)),
                        },
                        None => Ok(Some(Token::Gt)),
                    }
                }
                // colon
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(&ch) => match ch {
                            // double colon
                            ':' => self.consume_and_return(chars, Token::DoubleColon),
                            _ => Ok(Some(Token::Colon)),
                        },
                        None => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                // brackets
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Consume a single character and return the given token
    fn consume_and_return(
        &self,
        chars: &mut Peekable<Chars>,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

#[cfg(test)]
mod tests {
    use super::super::dialect::GenericSqlDialect;
    use super::*;

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::Keyword(String::from("SELECT")),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::Keyword(String::from("SELECT")),
            Token::Whitespace(Whitespace::Space),
            Token::Identifier(String::from("sqrt")),
            Token::LParen,
            Token::Number(String::from("1")),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::Keyword(String::from("SELECT")),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::Keyword(String::from("FROM")),
            Token::Whitespace(Whitespace::Space),
            Token::Identifier(String::from("customer")),
            Token::Whitespace(Whitespace::Space),
            Token::Keyword(String::from("WHERE")),
            Token::Whitespace(Whitespace::Space),
            Token::Identifier(String::from("id")),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::Keyword(String::from("LIMIT")),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::Keyword(String::from("SELECT")),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::Keyword(String::from("FROM")),
            Token::Whitespace(Whitespace::Space),
            Token::Identifier(String::from("customer")),
            Token::Whitespace(Whitespace::Space),
            Token::Keyword(String::from("WHERE")),
            Token::Whitespace(Whitespace::Space),
            Token::Identifier(String::from("salary")),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");

        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::Identifier("h".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");

        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::Keyword("SELECT".into()),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::Keyword("FROM".into()),
            Token::Whitespace(Whitespace::Space),
            Token::Keyword("TABLE".into()),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::Identifier("h".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::Identifier(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::Keyword("IS".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::Keyword("NULL".to_string()),
        ];

        compare(expected, tokens);
    }
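
    // Added sketch of a negative test: a bare `!` is only valid as part of the
    // `!=` operator, so tokenizing it alone must fail. The expected column
    // (8, pointing at the `!`) follows the col tracking in `tokenize` above.
    #[test]
    fn tokenize_exclamation_without_equals() {
        let sql = String::from("SELECT !a");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let result = tokenizer.tokenize();
        assert_eq!(
            result,
            Err(TokenizerError(
                "Tokenizer Error at Line: 1, Col: 8".to_string()
            ))
        );
    }

    // Added sketch: `<>` is tokenized as the same `Neq` token as `!=`.
    #[test]
    fn tokenize_not_equals_alternate() {
        let sql = String::from("1 <> 2");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("2")),
        ];

        compare(expected, tokens);
    }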

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        //println!("------------------------------");
        //println!("tokens = {:?}", actual);
        //println!("expected = {:?}", expected);
        //println!("------------------------------");
        assert_eq!(expected, actual);
    }
}