// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
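//!
//! A minimal usage sketch (mirroring the example on [`Tokenizer::new`]):
//!
//! ```
//! use sqlparser::dialect::GenericDialect;
//! use sqlparser::tokenizer::Tokenizer;
//!
//! let dialect = GenericDialect {};
//! let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
//! assert_eq!(tokens.len(), 3); // `SELECT`, a space, and `1`
//! ```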
#[cfg(not(feature = "std"))]
use alloc::{
borrow::ToOwned,
format,
string::{String, ToString},
vec,
vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};
use crate::ast::DollarQuotedString;
use crate::dialect::Dialect;
use crate::dialect::{
BigQueryDialect, DuckDbDialect, GenericDialect, PostgreSqlDialect, SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
/// An end-of-file marker, not a real token
EOF,
/// A keyword (like SELECT) or an optionally quoted SQL identifier
Word(Word),
/// An unsigned numeric literal, with a flag that is true when the literal
/// has a trailing `L` ("long") suffix
Number(String, bool),
/// A character that could not be tokenized
Char(char),
/// Single quoted string: i.e: 'string'
SingleQuotedString(String),
/// Double quoted string: i.e: "string"
DoubleQuotedString(String),
/// Triple single quoted strings: Example '''abc'''
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedString(String),
/// Triple double quoted strings: Example """abc"""
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedString(String),
/// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
DollarQuotedString(DollarQuotedString),
/// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
/// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
SingleQuotedByteStringLiteral(String),
/// Byte string literal: i.e: b"string" or B"string"
DoubleQuotedByteStringLiteral(String),
/// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedByteStringLiteral(String),
/// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedByteStringLiteral(String),
/// Single quoted literal with raw string prefix. Example `R'abc'`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
SingleQuotedRawStringLiteral(String),
/// Double quoted literal with raw string prefix. Example `R"abc"`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
DoubleQuotedRawStringLiteral(String),
/// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedRawStringLiteral(String),
/// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedRawStringLiteral(String),
/// "National" string literal: i.e: N'string'
NationalStringLiteral(String),
/// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
EscapedStringLiteral(String),
/// Unicode string literal: i.e: U&'first \000A second'
UnicodeStringLiteral(String),
/// Hexadecimal string literal: i.e.: X'deadbeef'
HexStringLiteral(String),
/// Comma
Comma,
/// Whitespace (space, tab, etc)
Whitespace(Whitespace),
/// Double equals sign `==`
DoubleEq,
/// Equality operator `=`
Eq,
/// Not Equals operator `<>` (or `!=` in some dialects)
Neq,
/// Less Than operator `<`
Lt,
/// Greater Than operator `>`
Gt,
/// Less Than Or Equals operator `<=`
LtEq,
/// Greater Than Or Equals operator `>=`
GtEq,
/// Spaceship operator `<=>`
Spaceship,
/// Plus operator `+`
Plus,
/// Minus operator `-`
Minus,
/// Multiplication operator `*`
Mul,
/// Division operator `/`
Div,
/// Integer division operator `//` in DuckDB
DuckIntDiv,
/// Modulo Operator `%`
Mod,
/// String concatenation `||`
StringConcat,
/// Left parenthesis `(`
LParen,
/// Right parenthesis `)`
RParen,
/// Period (used for compound identifiers or projections into nested types)
Period,
/// Colon `:`
Colon,
/// DoubleColon `::` (used for casting in PostgreSQL)
DoubleColon,
/// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
Assignment,
/// SemiColon `;` used as a separator for COPY and payload
SemiColon,
/// Backslash `\` used in terminating the COPY payload with `\.`
Backslash,
/// Left bracket `[`
LBracket,
/// Right bracket `]`
RBracket,
/// Ampersand `&`
Ampersand,
/// Pipe `|`
Pipe,
/// Caret `^`
Caret,
/// Left brace `{`
LBrace,
/// Right brace `}`
RBrace,
/// Right Arrow `=>`
RArrow,
/// Sharp `#` used for PostgreSQL Bitwise XOR operator
Sharp,
/// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
Tilde,
/// `~*` , a case insensitive match regular expression operator in PostgreSQL
TildeAsterisk,
/// `!~` , a case sensitive not match regular expression operator in PostgreSQL
ExclamationMarkTilde,
/// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
ExclamationMarkTildeAsterisk,
/// `~~`, a case sensitive match pattern operator in PostgreSQL
DoubleTilde,
/// `~~*`, a case insensitive match pattern operator in PostgreSQL
DoubleTildeAsterisk,
/// `!~~`, a case sensitive not match pattern operator in PostgreSQL
ExclamationMarkDoubleTilde,
/// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
ExclamationMarkDoubleTildeAsterisk,
/// `<<`, a bitwise shift left operator in PostgreSQL
ShiftLeft,
/// `>>`, a bitwise shift right operator in PostgreSQL
ShiftRight,
/// `&&`, an overlap operator in PostgreSQL
Overlap,
/// Exclamation Mark `!` used for PostgreSQL factorial operator
ExclamationMark,
/// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
DoubleExclamationMark,
/// AtSign `@` used for PostgreSQL abs operator
AtSign,
/// `^@`, a "starts with" string operator in PostgreSQL
CaretAt,
/// `|/`, a square root math operator in PostgreSQL
PGSquareRoot,
/// `||/`, a cube root math operator in PostgreSQL
PGCubeRoot,
/// `?` or `$`, a prepared statement arg placeholder
Placeholder(String),
/// `->`, used as an operator to extract json field in PostgreSQL
Arrow,
/// `->>`, used as an operator to extract json field as text in PostgreSQL
LongArrow,
/// `#>`, extracts JSON sub-object at the specified path
HashArrow,
/// `#>>`, extracts JSON sub-object at the specified path as text
HashLongArrow,
/// jsonb @> jsonb -> boolean: Test whether left json contains the right json
AtArrow,
/// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
ArrowAt,
/// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
/// path, where path elements can be either field keys or array indexes.
HashMinus,
/// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
/// JSON value?
AtQuestion,
/// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
/// for the specified JSON value. Only the first item of the result is taken into
/// account. If the result is not Boolean, then NULL is returned.
AtAt,
/// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
/// jsonb object
Question,
/// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
/// keys within the jsonb object
QuestionAnd,
/// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
/// keys within the jsonb object
QuestionPipe,
/// Custom binary operator
/// This is used to represent any custom binary operator that is not part of the SQL standard.
/// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
CustomBinaryOperator(String),
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Token::EOF => f.write_str("EOF"),
Token::Word(ref w) => write!(f, "{w}"),
Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
Token::Char(ref c) => write!(f, "{c}"),
Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
Token::DollarQuotedString(ref s) => write!(f, "{s}"),
Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
Token::Comma => f.write_str(","),
Token::Whitespace(ws) => write!(f, "{ws}"),
Token::DoubleEq => f.write_str("=="),
Token::Spaceship => f.write_str("<=>"),
Token::Eq => f.write_str("="),
Token::Neq => f.write_str("<>"),
Token::Lt => f.write_str("<"),
Token::Gt => f.write_str(">"),
Token::LtEq => f.write_str("<="),
Token::GtEq => f.write_str(">="),
Token::Plus => f.write_str("+"),
Token::Minus => f.write_str("-"),
Token::Mul => f.write_str("*"),
Token::Div => f.write_str("/"),
Token::DuckIntDiv => f.write_str("//"),
Token::StringConcat => f.write_str("||"),
Token::Mod => f.write_str("%"),
Token::LParen => f.write_str("("),
Token::RParen => f.write_str(")"),
Token::Period => f.write_str("."),
Token::Colon => f.write_str(":"),
Token::DoubleColon => f.write_str("::"),
Token::Assignment => f.write_str(":="),
Token::SemiColon => f.write_str(";"),
Token::Backslash => f.write_str("\\"),
Token::LBracket => f.write_str("["),
Token::RBracket => f.write_str("]"),
Token::Ampersand => f.write_str("&"),
Token::Caret => f.write_str("^"),
Token::Pipe => f.write_str("|"),
Token::LBrace => f.write_str("{"),
Token::RBrace => f.write_str("}"),
Token::RArrow => f.write_str("=>"),
Token::Sharp => f.write_str("#"),
Token::ExclamationMark => f.write_str("!"),
Token::DoubleExclamationMark => f.write_str("!!"),
Token::Tilde => f.write_str("~"),
Token::TildeAsterisk => f.write_str("~*"),
Token::ExclamationMarkTilde => f.write_str("!~"),
Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
Token::DoubleTilde => f.write_str("~~"),
Token::DoubleTildeAsterisk => f.write_str("~~*"),
Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
Token::AtSign => f.write_str("@"),
Token::CaretAt => f.write_str("^@"),
Token::ShiftLeft => f.write_str("<<"),
Token::ShiftRight => f.write_str(">>"),
Token::Overlap => f.write_str("&&"),
Token::PGSquareRoot => f.write_str("|/"),
Token::PGCubeRoot => f.write_str("||/"),
Token::Placeholder(ref s) => write!(f, "{s}"),
Token::Arrow => write!(f, "->"),
Token::LongArrow => write!(f, "->>"),
Token::HashArrow => write!(f, "#>"),
Token::HashLongArrow => write!(f, "#>>"),
Token::AtArrow => write!(f, "@>"),
Token::ArrowAt => write!(f, "<@"),
Token::HashMinus => write!(f, "#-"),
Token::AtQuestion => write!(f, "@?"),
Token::AtAt => write!(f, "@@"),
Token::Question => write!(f, "?"),
Token::QuestionAnd => write!(f, "?&"),
Token::QuestionPipe => write!(f, "?|"),
Token::CustomBinaryOperator(s) => f.write_str(s),
}
}
}
impl Token {
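/// Create a keyword token (an unquoted [`Word`]) from the given string.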
pub fn make_keyword(keyword: &str) -> Self {
Token::make_word(keyword, None)
}
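/// Create a [`Word`] token from `word`, with an optional quote style.
/// Keyword lookup is only performed for unquoted words; a quoted word is
/// always a plain identifier with `Keyword::NoKeyword`. A brief sketch:
///
/// ```
/// # use sqlparser::tokenizer::Token;
/// # use sqlparser::keywords::Keyword;
/// match Token::make_word("SELECT", None) {
///     Token::Word(w) => assert_eq!(w.keyword, Keyword::SELECT),
///     _ => unreachable!(),
/// }
/// ```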
pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
let word_uppercase = word.to_uppercase();
Token::Word(Word {
value: word.to_string(),
quote_style,
keyword: if quote_style.is_none() {
let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
} else {
Keyword::NoKeyword
},
})
}
}
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
/// The value of the token, without the enclosing quotes, and with the
/// escape sequences (if any) processed (TODO: escapes are not handled)
pub value: String,
/// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
/// The standard and most implementations allow using double quotes for this,
/// but some implementations support other quoting styles as well (e.g. \[MS SQL])
pub quote_style: Option<char>,
/// If the word was not quoted and it matched one of the known keywords,
/// this will have one of the values from [`Keyword`], otherwise `Keyword::NoKeyword`
pub keyword: Keyword,
}
impl fmt::Display for Word {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.quote_style {
Some(s) if s == '"' || s == '[' || s == '`' => {
write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
}
None => f.write_str(&self.value),
_ => panic!("Unexpected quote_style!"),
}
}
}
impl Word {
fn matching_end_quote(ch: char) -> char {
match ch {
'"' => '"', // ANSI and most dialects
'[' => ']', // MS SQL
'`' => '`', // MySQL
_ => panic!("unexpected quoting style!"),
}
}
}
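/// Whitespace and comment tokens (comments are treated as whitespace by the tokenizer)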
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
Space,
Newline,
Tab,
SingleLineComment { comment: String, prefix: String },
MultiLineComment(String),
}
impl fmt::Display for Whitespace {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Whitespace::Space => f.write_str(" "),
Whitespace::Newline => f.write_str("\n"),
Whitespace::Tab => f.write_str("\t"),
Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
}
}
}
/// Location in input string
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub struct Location {
/// Line number, starting from 1
pub line: u64,
/// Line column, starting from 1
pub column: u64,
}
impl fmt::Display for Location {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// A line number of 0 indicates no location was recorded (see `TokenWithLocation::wrap`)
if self.line == 0 {
return Ok(());
}
write!(
f,
// TODO: use standard compiler location syntax (<path>:<line>:<col>)
" at Line: {}, Column: {}",
self.line, self.column,
)
}
}
/// A [Token] with [Location] attached to it
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
pub token: Token,
pub location: Location,
}
impl TokenWithLocation {
pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
TokenWithLocation {
token,
location: Location { line, column },
}
}
pub fn wrap(token: Token) -> TokenWithLocation {
TokenWithLocation::new(token, 0, 0)
}
}
impl PartialEq<Token> for TokenWithLocation {
fn eq(&self, other: &Token) -> bool {
&self.token == other
}
}
impl PartialEq<TokenWithLocation> for Token {
fn eq(&self, other: &TokenWithLocation) -> bool {
self == &other.token
}
}
impl fmt::Display for TokenWithLocation {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.token.fmt(f)
}
}
/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
pub message: String,
pub location: Location,
}
impl fmt::Display for TokenizerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}{}", self.message, self.location,)
}
}
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
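/// A peekable stream of characters that tracks the current (1-based) line and column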
struct State<'a> {
peekable: Peekable<Chars<'a>>,
pub line: u64,
pub col: u64,
}
impl<'a> State<'a> {
/// return the next character and advance the stream
pub fn next(&mut self) -> Option<char> {
match self.peekable.next() {
None => None,
Some(s) => {
if s == '\n' {
self.line += 1;
self.col = 1;
} else {
self.col += 1;
}
Some(s)
}
}
}
/// return the next character but do not advance the stream
pub fn peek(&mut self) -> Option<&char> {
self.peekable.peek()
}
pub fn location(&self) -> Location {
Location {
line: self.line,
column: self.col,
}
}
}
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
/// e.g. `"abc"`, `'abc'`, `r'abc'`
One,
/// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
Many(NonZeroU8),
}
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
/// The character used to quote the string.
quote_style: char,
/// Represents how many quotes characters enclose the string literal.
num_quote_chars: NumStringQuoteChars,
/// The number of opening quotes left to consume, before parsing
/// the remaining string literal.
/// For example: given initial string `"""abc"""`. If the caller has
/// already parsed the first quote for some reason, then this value
/// is set to 1, flagging to look to consume only 2 leading quotes.
num_opening_quotes_to_consume: u8,
/// True if the string uses backslash escaping of special characters
/// e.g `'abc\ndef\'ghi'
backslash_escape: bool,
}
/// SQL Tokenizer
pub struct Tokenizer<'a> {
dialect: &'a dyn Dialect,
query: &'a str,
/// If true (the default), the tokenizer will un-escape literal
/// SQL strings. See [`Tokenizer::with_unescape`] for more details.
unescape: bool,
}
impl<'a> Tokenizer<'a> {
/// Create a new SQL tokenizer for the specified SQL statement
///
/// ```
/// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
/// # use sqlparser::dialect::GenericDialect;
/// # let dialect = GenericDialect{};
/// let query = r#"SELECT 'foo'"#;
///
/// // Parsing the query
/// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
///
/// assert_eq!(tokens, vec![
/// Token::make_word("SELECT", None),
/// Token::Whitespace(Whitespace::Space),
/// Token::SingleQuotedString("foo".to_string()),
/// ]);
/// ```
pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
Self {
dialect,
query,
unescape: true,
}
}
/// Set unescape mode
///
/// When true (default) the tokenizer unescapes literal values
/// (for example, `""` in SQL is unescaped to the literal `"`).
///
/// When false, the tokenizer provides the raw strings as provided
/// in the query. This can be helpful for programs that wish to
/// recover the *exact* original query text without normalizing
/// the escaping
///
/// # Example
///
/// ```
/// # use sqlparser::tokenizer::{Token, Tokenizer};
/// # use sqlparser::dialect::GenericDialect;
/// # let dialect = GenericDialect{};
/// let query = r#""Foo "" Bar""#;
/// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
/// let original = Token::make_word(r#"Foo "" Bar"#, Some('"'));
///
/// // Parsing with unescaping (default)
/// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
/// assert_eq!(tokens, vec![unescaped]);
///
/// // Parsing with unescape = false
/// let tokens = Tokenizer::new(&dialect, &query)
/// .with_unescape(false)
/// .tokenize().unwrap();
/// assert_eq!(tokens, vec![original]);
/// ```
pub fn with_unescape(mut self, unescape: bool) -> Self {
self.unescape = unescape;
self
}
/// Tokenize the statement and produce a vector of tokens
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
let twl = self.tokenize_with_location()?;
Ok(twl.into_iter().map(|t| t.token).collect())
}
/// Tokenize the statement and produce a vector of tokens with location information
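///
/// A small sketch showing the 1-based line/column attached to each token:
///
/// ```
/// # use sqlparser::tokenizer::Tokenizer;
/// # use sqlparser::dialect::GenericDialect;
/// # let dialect = GenericDialect {};
/// let tokens = Tokenizer::new(&dialect, "SELECT 1")
///     .tokenize_with_location()
///     .unwrap();
/// assert_eq!(tokens[0].location.line, 1);
/// assert_eq!(tokens[0].location.column, 1);
/// ```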
pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
let mut tokens: Vec<TokenWithLocation> = vec![];
self.tokenize_with_location_into_buf(&mut tokens)
.map(|_| tokens)
}
/// Tokenize the statement and append tokens with location information into the provided buffer.
/// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
pub fn tokenize_with_location_into_buf(
&mut self,
buf: &mut Vec<TokenWithLocation>,
) -> Result<(), TokenizerError> {
let mut state = State {
peekable: self.query.chars().peekable(),
line: 1,
col: 1,
};
let mut location = state.location();
while let Some(token) = self.next_token(&mut state)? {
buf.push(TokenWithLocation { token, location });
location = state.location();
}
Ok(())
}
// Tokenize the identifier or keyword starting with the chars in `ch`
fn tokenize_identifier_or_keyword(
&self,
ch: impl IntoIterator<Item = char>,
chars: &mut State,
) -> Result<Option<Token>, TokenizerError> {
chars.next(); // consume the first char
let ch: String = ch.into_iter().collect();
let word = self.tokenize_word(ch, chars);
// TODO: implement parsing of exponent here
if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
let mut inner_state = State {
peekable: word.chars().peekable(),
line: 0,
col: 0,
};
let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
s += s2.as_str();
return Ok(Some(Token::Number(s, false)));
}
Ok(Some(Token::make_word(&word, None)))
}
/// Get the next token or return None
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
match chars.peek() {
Some(&ch) => match ch {
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
'\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
'\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
'\r' => {
// Emit a single Whitespace::Newline token for \r and \r\n
chars.next();
if let Some('\n') = chars.peek() {
chars.next();
}
Ok(Some(Token::Whitespace(Whitespace::Newline)))
}
// BigQuery uses b or B for byte string literal
b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedByteStringLiteral,
Token::TripleSingleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
}
Some('\"') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedByteStringLiteral,
Token::TripleDoubleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
}
_ => {
// regular identifier starting with an "b" or "B"
let s = self.tokenize_word(b, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// BigQuery uses r or R for raw string literal
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
Some('\'') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedRawStringLiteral,
Token::TripleSingleQuotedRawStringLiteral,
),
Some('\"') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedRawStringLiteral,
Token::TripleDoubleQuotedRawStringLiteral,
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// Redshift uses lower case n for national string literal
n @ 'N' | n @ 'n' => {
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
// N'...' - a <national character string literal>
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
// regular identifier starting with an "N"
let s = self.tokenize_word(n, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
x @ 'e' | x @ 'E' => {
let starting_loc = chars.location();
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
let s =
self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
Ok(Some(Token::EscapedStringLiteral(s)))
}
_ => {
// regular identifier starting with an "E" or "e"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
chars.next(); // consume, to check the next char
if chars.peek() == Some(&'&') {
// we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
let mut chars_clone = chars.peekable.clone();
chars_clone.next(); // consume the '&' in the clone
if chars_clone.peek() == Some(&'\'') {
chars.next(); // consume the '&' in the original iterator
let s = unescape_unicode_single_quoted_string(chars)?;
return Ok(Some(Token::UnicodeStringLiteral(s)));
}
}
// regular identifier starting with an "U" or "u"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
x @ 'x' | x @ 'X' => {
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
// regular identifier starting with an "X"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// single quoted string
'\'' => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
Token::SingleQuotedString,
Token::TripleSingleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::SingleQuotedString(s)))
}
// double quoted string
'\"' if !self.dialect.is_delimited_identifier_start(ch)
&& !self.dialect.is_identifier_start(ch) =>
{
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
Token::DoubleQuotedString,
Token::TripleDoubleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::DoubleQuotedString(s)))
}
// delimited (quoted) identifier
quote_start
if self.dialect.is_delimited_identifier_start(ch)
&& self
.dialect
.is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
{
let error_loc = chars.location();
chars.next(); // consume the opening quote
let quote_end = Word::matching_end_quote(quote_start);
let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
if last_char == Some(quote_end) {
Ok(Some(Token::make_word(&s, Some(quote_start))))
} else {
self.tokenizer_error(
error_loc,
format!("Expected close delimiter '{quote_end}' before EOF."),
)
}
}
// numbers and period
'0'..='9' | '.' => {
let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());
// match binary literal that starts with 0x
if s == "0" && chars.peek() == Some(&'x') {
chars.next();
let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
return Ok(Some(Token::HexStringLiteral(s2)));
}
// match one period
if let Some('.') = chars.peek() {
s.push('.');
chars.next();
}
s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());
// No number -> Token::Period
if s == "." {
return Ok(Some(Token::Period));
}
let mut exponent_part = String::new();
// Parse exponent as number
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
let mut char_clone = chars.peekable.clone();
exponent_part.push(char_clone.next().unwrap());
// Optional sign
match char_clone.peek() {
Some(&c) if matches!(c, '+' | '-') => {
exponent_part.push(c);
char_clone.next();
}
_ => (),
}
match char_clone.peek() {
// Definitely an exponent, get original iterator up to speed and use it
Some(&c) if c.is_ascii_digit() => {
for _ in 0..exponent_part.len() {
chars.next();
}
exponent_part +=
&peeking_take_while(chars, |ch| ch.is_ascii_digit());
s += exponent_part.as_str();
}
// Not an exponent, discard the work done
_ => (),
}
}
// The MySQL dialect supports identifiers that start with a numeric prefix,
// as long as they aren't an exponent number.
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
let word =
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
if !word.is_empty() {
s += word.as_str();
return Ok(Some(Token::make_word(s.as_str(), None)));
}
}
let long = if chars.peek() == Some(&'L') {
chars.next();
true
} else {
false
};
Ok(Some(Token::Number(s, long)))
}
// punctuation
'(' => self.consume_and_return(chars, Token::LParen),
')' => self.consume_and_return(chars, Token::RParen),
',' => self.consume_and_return(chars, Token::Comma),
// operators
'-' => {
chars.next(); // consume the '-'
match chars.peek() {
Some('-') => {
chars.next(); // consume the second '-', starting a single-line comment
let comment = self.tokenize_single_line_comment(chars);
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_owned(),
comment,
})))
}
Some('>') => {
chars.next();
match chars.peek() {
Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
_ => self.start_binop(chars, "->", Token::Arrow),
}
}
// a regular '-' operator
_ => self.start_binop(chars, "-", Token::Minus),
}
}
'/' => {
chars.next(); // consume the '/'
match chars.peek() {
Some('*') => {
chars.next(); // consume the '*', starting a multi-line comment
self.tokenize_multiline_comment(chars)
}
Some('/') if dialect_of!(self is SnowflakeDialect) => {
chars.next(); // consume the second '/', starting a snowflake single-line comment
let comment = self.tokenize_single_line_comment(chars);
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "//".to_owned(),
comment,
})))
}
Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
self.consume_and_return(chars, Token::DuckIntDiv)
}
// a regular '/' operator
_ => Ok(Some(Token::Div)),
}
}
'+' => self.consume_and_return(chars, Token::Plus),
'*' => self.consume_and_return(chars, Token::Mul),
'%' => {
chars.next(); // advance past '%'
match chars.peek() {
Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
Some(sch) if self.dialect.is_identifier_start('%') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => self.start_binop(chars, "%", Token::Mod),
}
}
'|' => {
chars.next(); // consume the '|'
match chars.peek() {
Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
Some('|') => {
chars.next(); // consume the second '|'
match chars.peek() {
Some('/') => {
self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
}
_ => self.start_binop(chars, "||", Token::StringConcat),
}
}
// Bitwise '|' operator
_ => self.start_binop(chars, "|", Token::Pipe),
}
}
'=' => {
chars.next(); // consume
match chars.peek() {
Some('>') => self.consume_and_return(chars, Token::RArrow),
Some('=') => self.consume_and_return(chars, Token::DoubleEq),
_ => Ok(Some(Token::Eq)),
}
}
'!' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::Neq),
Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => self
.consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => self.consume_and_return(
chars,
Token::ExclamationMarkDoubleTildeAsterisk,
),
_ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
}
}
_ => Ok(Some(Token::ExclamationMarkTilde)),
}
}
_ => Ok(Some(Token::ExclamationMark)),
}
}
'<' => {
chars.next(); // consume
match chars.peek() {
Some('=') => {
chars.next();
match chars.peek() {
Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
_ => self.start_binop(chars, "<=", Token::LtEq),
}
}
Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
_ => self.start_binop(chars, "<", Token::Lt),
}
}
'>' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
_ => self.start_binop(chars, ">", Token::Gt),
}
}
':' => {
chars.next();
match chars.peek() {
Some(':') => self.consume_and_return(chars, Token::DoubleColon),
Some('=') => self.consume_and_return(chars, Token::Assignment),
_ => Ok(Some(Token::Colon)),
}
}
';' => self.consume_and_return(chars, Token::SemiColon),
'\\' => self.consume_and_return(chars, Token::Backslash),
'[' => self.consume_and_return(chars, Token::LBracket),
']' => self.consume_and_return(chars, Token::RBracket),
'&' => {
chars.next(); // consume the '&'
match chars.peek() {
Some('&') => {
chars.next(); // consume the second '&'
self.start_binop(chars, "&&", Token::Overlap)
}
// Bitwise '&' operator
_ => self.start_binop(chars, "&", Token::Ampersand),
}
}
'^' => {
chars.next(); // consume the '^'
match chars.peek() {
Some('@') => self.consume_and_return(chars, Token::CaretAt),
_ => Ok(Some(Token::Caret)),
}
}
'{' => self.consume_and_return(chars, Token::LBrace),
'}' => self.consume_and_return(chars, Token::RBrace),
'#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect) => {
chars.next(); // consume the '#', starting a snowflake single-line comment
let comment = self.tokenize_single_line_comment(chars);
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "#".to_owned(),
comment,
})))
}
'~' => {
chars.next(); // consume
match chars.peek() {
Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => {
self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
}
_ => self.start_binop(chars, "~~", Token::DoubleTilde),
}
}
_ => self.start_binop(chars, "~", Token::Tilde),
}
}
'#' => {
chars.next();
match chars.peek() {
Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
Some('>') => {
chars.next();
match chars.peek() {
Some('>') => {
self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
}
_ => self.start_binop(chars, "#>", Token::HashArrow),
}
}
Some(' ') => Ok(Some(Token::Sharp)),
Some(sch) if self.dialect.is_identifier_start('#') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => self.start_binop(chars, "#", Token::Sharp),
}
}
'@' => {
chars.next();
match chars.peek() {
Some('>') => self.consume_and_return(chars, Token::AtArrow),
Some('?') => self.consume_and_return(chars, Token::AtQuestion),
Some('@') => {
chars.next();
match chars.peek() {
Some(' ') => Ok(Some(Token::AtAt)),
Some(tch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
}
_ => Ok(Some(Token::AtAt)),
}
}
Some(' ') => Ok(Some(Token::AtSign)),
Some(sch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => Ok(Some(Token::AtSign)),
}
}
// Postgres uses ? for jsonb operators, not prepared statements
'?' if dialect_of!(self is PostgreSqlDialect) => {
chars.next();
match chars.peek() {
Some('|') => self.consume_and_return(chars, Token::QuestionPipe),
Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
_ => self.consume_and_return(chars, Token::Question),
}
}
'?' => {
chars.next();
let s = peeking_take_while(chars, |ch| ch.is_numeric());
Ok(Some(Token::Placeholder(String::from("?") + &s)))
}
// identifier or keyword
ch if self.dialect.is_identifier_start(ch) => {
self.tokenize_identifier_or_keyword([ch], chars)
}
'$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
// Whitespace check (including unicode chars) should be last as it covers some of the chars above
ch if ch.is_whitespace() => {
self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
}
other => self.consume_and_return(chars, Token::Char(other)),
},
None => Ok(None),
}
}
/// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
fn consume_for_binop(
&self,
chars: &mut State,
prefix: &str,
default: Token,
) -> Result<Option<Token>, TokenizerError> {
chars.next(); // consume the first char
self.start_binop(chars, prefix, default)
}
/// parse a custom binary operator
fn start_binop(
&self,
chars: &mut State,
prefix: &str,
default: Token,
) -> Result<Option<Token>, TokenizerError> {
let mut custom = None;
while let Some(&ch) = chars.peek() {
if !self.dialect.is_custom_operator_part(ch) {
break;
}
custom.get_or_insert_with(|| prefix.to_string()).push(ch);
chars.next();
}
Ok(Some(
custom.map(Token::CustomBinaryOperator).unwrap_or(default),
))
}
/// Tokenize a dollar-preceded value (i.e: a string/placeholder)
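/// Examples: `$$abc$$` and `$tag$abc$tag$` produce a [`Token::DollarQuotedString`],
/// while `$1` produces a [`Token::Placeholder`].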
fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
let mut s = String::new();
let mut value = String::new();
chars.next();
if let Some('$') = chars.peek() {
chars.next();
let mut is_terminated = false;
let mut prev: Option<char> = None;
while let Some(&ch) = chars.peek() {
if prev == Some('$') {
if ch == '$' {
chars.next();
is_terminated = true;
break;
} else {
s.push('$');
s.push(ch);
}
} else if ch != '$' {
s.push(ch);
}
prev = Some(ch);
chars.next();
}
return if chars.peek().is_none() && !is_terminated {
self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
} else {
Ok(Token::DollarQuotedString(DollarQuotedString {
value: s,
tag: None,
}))
};
} else {
value.push_str(&peeking_take_while(chars, |ch| {
ch.is_alphanumeric() || ch == '_'
}));
if let Some('$') = chars.peek() {
chars.next();
'searching_for_end: loop {
s.push_str(&peeking_take_while(chars, |ch| ch != '$'));
match chars.peek() {
Some('$') => {
chars.next();
let mut maybe_s = String::from("$");
for c in value.chars() {
if let Some(next_char) = chars.next() {
maybe_s.push(next_char);
if next_char != c {
// This doesn't match the dollar quote delimiter so this
// is not the end of the string.
s.push_str(&maybe_s);
continue 'searching_for_end;
}
} else {
return self.tokenizer_error(
chars.location(),
"Unterminated dollar-quoted, expected $",
);
}
}
if chars.peek() == Some(&'$') {
chars.next();
maybe_s.push('$');
// maybe_s matches the end delimiter
break 'searching_for_end;
} else {
// This also doesn't match the dollar quote delimiter as there are
// more characters before the second dollar so this is not the end
// of the string.
s.push_str(&maybe_s);
continue 'searching_for_end;
}
}
_ => {
return self.tokenizer_error(
chars.location(),
"Unterminated dollar-quoted, expected $",
)
}
}
}
} else {
return Ok(Token::Placeholder(String::from("$") + &value));
}
}
Ok(Token::DollarQuotedString(DollarQuotedString {
value: s,
tag: if value.is_empty() { None } else { Some(value) },
}))
}
fn tokenizer_error<R>(
&self,
loc: Location,
message: impl Into<String>,
) -> Result<R, TokenizerError> {
Err(TokenizerError {
message: message.into(),
location: loc,
})
}
// Consume characters until newline
fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
let mut comment = peeking_take_while(chars, |ch| ch != '\n');
if let Some(ch) = chars.next() {
assert_eq!(ch, '\n');
comment.push(ch);
}
comment
}
/// Tokenize an identifier or keyword, after the first char is already consumed.
fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
let mut s = first_chars.into();
s.push_str(&peeking_take_while(chars, |ch| {
self.dialect.is_identifier_part(ch)
}));
s
}
/// Read an escaped single quoted string, starting with the opening quote.
fn tokenize_escaped_single_quoted_string(
&self,
starting_loc: Location,
chars: &mut State,
) -> Result<String, TokenizerError> {
if let Some(s) = unescape_single_quoted_string(chars) {
return Ok(s);
}
self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
}
/// Reads a string literal quoted by a single or triple quote characters.
/// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
fn tokenize_single_or_triple_quoted_string<F>(
&self,
chars: &mut State,
quote_style: char,
backslash_escape: bool,
single_quote_token: F,
triple_quote_token: F,
) -> Result<Option<Token>, TokenizerError>
where
F: Fn(String) -> Token,
{
let error_loc = chars.location();
let mut num_opening_quotes = 0u8;
for _ in 0..3 {
if Some(&quote_style) == chars.peek() {
chars.next(); // Consume quote.
num_opening_quotes += 1;
} else {
break;
}
}
let (token_fn, num_quote_chars) = match num_opening_quotes {
1 => (single_quote_token, NumStringQuoteChars::One),
2 => {
// If we matched double quotes, then this is an empty string.
return Ok(Some(single_quote_token("".into())));
}
3 => {
let Some(num_quote_chars) = NonZeroU8::new(3) else {
return self.tokenizer_error(error_loc, "invalid number of opening quotes");
};
(
triple_quote_token,
NumStringQuoteChars::Many(num_quote_chars),
)
}
_ => {
return self.tokenizer_error(error_loc, "invalid string literal opening");
}
};
let settings = TokenizeQuotedStringSettings {
quote_style,
num_quote_chars,
num_opening_quotes_to_consume: 0,
backslash_escape,
};
self.tokenize_quoted_string(chars, settings)
.map(token_fn)
.map(Some)
}
/// Reads a string literal quoted by a single quote character.
fn tokenize_single_quoted_string(
&self,
chars: &mut State,
quote_style: char,
backslash_escape: bool,
) -> Result<String, TokenizerError> {
self.tokenize_quoted_string(
chars,
TokenizeQuotedStringSettings {
quote_style,
num_quote_chars: NumStringQuoteChars::One,
num_opening_quotes_to_consume: 1,
backslash_escape,
},
)
}
/// Read a quoted string.
fn tokenize_quoted_string(
&self,
chars: &mut State,
settings: TokenizeQuotedStringSettings,
) -> Result<String, TokenizerError> {
let mut s = String::new();
let error_loc = chars.location();
// Consume any opening quotes.
for _ in 0..settings.num_opening_quotes_to_consume {
if Some(settings.quote_style) != chars.next() {
return self.tokenizer_error(error_loc, "invalid string literal opening");
}
}
let mut num_consecutive_quotes = 0;
while let Some(&ch) = chars.peek() {
let pending_final_quote = match settings.num_quote_chars {
NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
n @ NumStringQuoteChars::Many(count)
if num_consecutive_quotes + 1 == count.get() =>
{
Some(n)
}
NumStringQuoteChars::Many(_) => None,
};
match ch {
char if char == settings.quote_style && pending_final_quote.is_some() => {
chars.next(); // consume
if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
// For an initial string like `"""abc"""`, at this point we have
// `abc""` in the buffer and have now matched the final `"`.
// However, the string to return is simply `abc`, so we strip off
// the trailing quotes before returning.
let mut buf = s.chars();
for _ in 1..count.get() {
buf.next_back();
}
return Ok(buf.as_str().to_string());
} else if chars
.peek()
.map(|c| *c == settings.quote_style)
.unwrap_or(false)
{
s.push(ch);
if !self.unescape {
// In no-escape mode, the given query has to be saved completely
s.push(ch);
}
chars.next();
} else {
return Ok(s);
}
}
'\\' if settings.backslash_escape => {
// consume backslash
chars.next();
num_consecutive_quotes = 0;
if let Some(next) = chars.peek() {
if !self.unescape {
// In no-escape mode, the given query has to be saved completely including backslashes.
s.push(ch);
s.push(*next);
chars.next(); // consume next
} else {
let n = match next {
'0' => '\0',
'a' => '\u{7}',
'b' => '\u{8}',
'f' => '\u{c}',
'n' => '\n',
'r' => '\r',
't' => '\t',
'Z' => '\u{1a}',
_ => *next,
};
s.push(n);
chars.next(); // consume next
}
}
}
ch => {
chars.next(); // consume ch
if ch == settings.quote_style {
num_consecutive_quotes += 1;
} else {
num_consecutive_quotes = 0;
}
s.push(ch);
}
}
}
self.tokenizer_error(error_loc, "Unterminated string literal")
}
fn tokenize_multiline_comment(
&self,
chars: &mut State,
) -> Result<Option<Token>, TokenizerError> {
let mut s = String::new();
let mut nested = 1;
let mut last_ch = ' ';
loop {
match chars.next() {
Some(ch) => {
if last_ch == '/' && ch == '*' {
nested += 1;
} else if last_ch == '*' && ch == '/' {
nested -= 1;
if nested == 0 {
s.pop();
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
}
}
s.push(ch);
last_ch = ch;
}
None => {
break self.tokenizer_error(
chars.location(),
"Unexpected EOF while in a multi-line comment",
)
}
}
}
}
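/// Read a delimited identifier up to `quote_end`, treating a doubled closing
/// quote as an escaped quote character. The second element of the returned
/// tuple is `Some(quote_end)` if the closing delimiter was found, `None` on EOF.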
fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
let mut last_char = None;
let mut s = String::new();
while let Some(ch) = chars.next() {
if ch == quote_end {
if chars.peek() == Some(&quote_end) {
chars.next();
s.push(ch);
if !self.unescape {
// In no-escape mode, the given query has to be saved completely
s.push(ch);
}
} else {
last_char = Some(quote_end);
break;
}
} else {
s.push(ch);
}
}
(s, last_char)
}
#[allow(clippy::unnecessary_wraps)]
fn consume_and_return(
&self,
chars: &mut State,
t: Token,
) -> Result<Option<Token>, TokenizerError> {
chars.next();
Ok(Some(t))
}
}
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
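/// For example, reading ASCII digits from the input `123abc` returns `"123"`
/// and leaves `'a'` as the next char in the stream.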
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
let mut s = String::new();
while let Some(&ch) = chars.peek() {
if predicate(ch) {
chars.next(); // consume
s.push(ch);
} else {
break;
}
}
s
}
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
Unescape::new(chars).unescape()
}
struct Unescape<'a: 'b, 'b> {
chars: &'b mut State<'a>,
}
impl<'a: 'b, 'b> Unescape<'a, 'b> {
fn new(chars: &'b mut State<'a>) -> Self {
Self { chars }
}
fn unescape(mut self) -> Option<String> {
let mut unescaped = String::new();
self.chars.next();
while let Some(c) = self.chars.next() {
if c == '\'' {
// case: ''''
if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
self.chars.next();
unescaped.push('\'');
continue;
}
return Some(unescaped);
}
if c != '\\' {
unescaped.push(c);
continue;
}
let c = match self.chars.next()? {
'b' => '\u{0008}',
'f' => '\u{000C}',
'n' => '\n',
'r' => '\r',
't' => '\t',
'u' => self.unescape_unicode_16()?,
'U' => self.unescape_unicode_32()?,
'x' => self.unescape_hex()?,
c if c.is_digit(8) => self.unescape_octal(c)?,
c => c,
};
unescaped.push(Self::check_null(c)?);
}
None
}
#[inline]
fn check_null(c: char) -> Option<char> {
if c == '\0' {
None
} else {
Some(c)
}
}
#[inline]
fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
// u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
match u32::from_str_radix(s, RADIX) {
Err(_) => None,
Ok(n) => {
let n = n & 0xFF;
if n <= 127 {
char::from_u32(n)
} else {
None
}
}
}
}
// Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
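// e.g. `\x41` unescapes to `A` (0x41 = 65, within the accepted 0..=127 range)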
fn unescape_hex(&mut self) -> Option<char> {
let mut s = String::new();
for _ in 0..2 {
match self.next_hex_digit() {
Some(c) => s.push(c),
None => break,
}
}
if s.is_empty() {
return Some('x');
}
Self::byte_to_char::<16>(&s)
}
#[inline]
fn next_hex_digit(&mut self) -> Option<char> {
match self.chars.peek() {
Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
_ => None,
}
}
// Octal byte value. \o, \oo, \ooo (o = 0–7)
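// e.g. `\101` unescapes to `A` (0o101 = 65)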
fn unescape_octal(&mut self, c: char) -> Option<char> {
let mut s = String::new();
s.push(c);
for _ in 0..2 {
match self.next_octal_digit() {
Some(c) => s.push(c),
None => break,
}
}
Self::byte_to_char::<8>(&s)
}
#[inline]
fn next_octal_digit(&mut self) -> Option<char> {
match self.chars.peek() {
Some(c) if c.is_digit(8) => self.chars.next(),
_ => None,
}
}
// 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
fn unescape_unicode_16(&mut self) -> Option<char> {
self.unescape_unicode::<4>()
}
// 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
fn unescape_unicode_32(&mut self) -> Option<char> {
self.unescape_unicode::<8>()
}
fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
let mut s = String::new();
for _ in 0..NUM {
s.push(self.chars.next()?);
}
match u32::from_str_radix(&s, 16) {
Err(_) => None,
Ok(n) => char::from_u32(n),
}
}
}
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
let mut unescaped = String::new();
chars.next(); // consume the opening quote
while let Some(c) = chars.next() {
match c {
'\'' => {
if chars.peek() == Some(&'\'') {
chars.next();
unescaped.push('\'');
} else {
return Ok(unescaped);
}
}
'\\' => match chars.peek() {
Some('\\') => {
chars.next();
unescaped.push('\\');
}
Some('+') => {
chars.next();
unescaped.push(take_char_from_hex_digits(chars, 6)?);
}
_ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
},
_ => {
unescaped.push(c);
}
}
}
Err(TokenizerError {
message: "Unterminated unicode encoded string literal".to_string(),
location: chars.location(),
})
}
fn take_char_from_hex_digits(
chars: &mut State<'_>,
max_digits: usize,
) -> Result<char, TokenizerError> {
let mut result = 0u32;
for _ in 0..max_digits {
let next_char = chars.next().ok_or_else(|| TokenizerError {
message: "Unexpected EOF while parsing hex digit in escaped unicode string."
.to_string(),
location: chars.location(),
})?;
let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
location: chars.location(),
})?;
result = result * 16 + digit;
}
char::from_u32(result).ok_or_else(|| TokenizerError {
message: format!("Invalid unicode character: {:x}", result),
location: chars.location(),
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::dialect::{
BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
};
use core::fmt::Debug;
#[test]
fn tokenizer_error_impl() {
let err = TokenizerError {
message: "test".into(),
location: Location { line: 1, column: 1 },
};
#[cfg(feature = "std")]
{
use std::error::Error;
assert!(err.source().is_none());
}
assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
}
#[test]
fn tokenize_select_1() {
let sql = String::from("SELECT 1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_select_float() {
let sql = String::from("SELECT .1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from(".1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_clickhouse_double_equal() {
let sql = String::from("SELECT foo=='1'");
let dialect = ClickHouseDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Word(Word {
value: "foo".to_string(),
quote_style: None,
keyword: Keyword::NoKeyword,
}),
Token::DoubleEq,
Token::SingleQuotedString("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_select_exponent() {
let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e10"), false),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e-10"), false),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e+10"), false),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
Token::make_word("ea", None),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e-10"), false),
Token::make_word("a", None),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e-10"), false),
Token::Minus,
Token::Number(String::from("10"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_scalar_function() {
let sql = String::from("SELECT sqrt(1)");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("sqrt", None),
Token::LParen,
Token::Number(String::from("1"), false),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_string_concat() {
let sql = String::from("SELECT 'a' || 'b'");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("a")),
Token::Whitespace(Whitespace::Space),
Token::StringConcat,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("b")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_bitwise_op() {
let sql = String::from("SELECT one | two ^ three");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("one", None),
Token::Whitespace(Whitespace::Space),
Token::Pipe,
Token::Whitespace(Whitespace::Space),
Token::make_word("two", None),
Token::Whitespace(Whitespace::Space),
Token::Caret,
Token::Whitespace(Whitespace::Space),
Token::make_word("three", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_logical_xor() {
let sql =
String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
];
compare(expected, tokens);
}
#[test]
fn tokenize_simple_select() {
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("LIMIT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_explain_select() {
let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("EXPLAIN"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_explain_analyze_select() {
let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("EXPLAIN"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("ANALYZE"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_predicate() {
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("salary", None),
Token::Whitespace(Whitespace::Space),
Token::Neq,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("Not Provided")),
];
compare(expected, tokens);
}
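// Unrecognized characters are emitted as Token::Char rather than failing tokenization.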
#[test]
fn tokenize_invalid_string() {
let sql = String::from("\n💝مصطفىh");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
// println!("tokens: {:#?}", tokens);
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Char('💝'),
Token::make_word("مصطفىh", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_newline_in_string_literal() {
let sql = String::from("'foo\r\nbar\nbaz'");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
compare(expected, tokens);
}
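// For unterminated literals, the error location points at the opening quote;
// columns are counted in characters, not bytes (see the UTF-8 case below).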
#[test]
fn tokenize_unterminated_string_literal() {
let sql = String::from("select 'foo");
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
assert_eq!(
tokenizer.tokenize(),
Err(TokenizerError {
message: "Unterminated string literal".to_string(),
location: Location { line: 1, column: 8 },
})
);
}
#[test]
fn tokenize_unterminated_string_literal_utf8() {
let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
assert_eq!(
tokenizer.tokenize(),
Err(TokenizerError {
message: "Unterminated string literal".to_string(),
location: Location {
line: 1,
column: 35
}
})
);
}
#[test]
fn tokenize_invalid_string_cols() {
let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
// println!("tokens: {:#?}", tokens);
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::Newline),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("table"),
Token::Whitespace(Whitespace::Tab),
Token::Char('💝'),
Token::make_word("مصطفىh", None),
];
compare(expected, tokens);
}
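// PostgreSQL-style dollar quoting: $tag$ ... $tag$ with an optional tag; the body is taken verbatim.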
#[test]
fn tokenize_dollar_quoted_string_tagged() {
let sql = String::from(
"SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$",
);
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
tag: Some("tag".into()),
}),
];
compare(expected, tokens);
}
#[test]
fn tokenize_dollar_quoted_string_tagged_unterminated() {
let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
let dialect = GenericDialect {};
assert_eq!(
Tokenizer::new(&dialect, &sql).tokenize(),
Err(TokenizerError {
message: "Unterminated dollar-quoted, expected $".into(),
location: Location {
line: 1,
column: 91
}
})
);
}
#[test]
fn tokenize_dollar_quoted_string_untagged() {
let sql =
String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "within dollar '$' quoted strings have $tags like this$ ".into(),
tag: None,
}),
];
compare(expected, tokens);
}
#[test]
fn tokenize_dollar_quoted_string_untagged_unterminated() {
let sql = String::from(
"SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
);
let dialect = GenericDialect {};
assert_eq!(
Tokenizer::new(&dialect, &sql).tokenize(),
Err(TokenizerError {
message: "Unterminated dollar-quoted string".into(),
location: Location {
line: 1,
column: 86
}
})
);
}
#[test]
fn tokenize_right_arrow() {
let sql = String::from("FUNCTION(key=>value)");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_word("FUNCTION", None),
Token::LParen,
Token::make_word("key", None),
Token::RArrow,
Token::make_word("value", None),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_is_null() {
let sql = String::from("a IS NULL");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_word("a", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("IS"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("NULL"),
];
compare(expected, tokens);
}
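// Single-line '--' comments run to the end of the line; the terminating newline
// stays in the comment body (compare the EOF case below).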
#[test]
fn tokenize_comment() {
let sql = String::from("0--this is a comment\n1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\n".to_string(),
}),
Token::Number("1".to_string(), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_comment_at_eof() {
let sql = String::from("--this is a comment");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment".to_string(),
})];
compare(expected, tokens);
}
#[test]
fn tokenize_multiline_comment() {
let sql = String::from("0/*multi-line\n* /comment*/1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* /comment".to_string(),
)),
Token::Number("1".to_string(), false),
];
compare(expected, tokens);
}
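// Multi-line comments nest: every inner /* must be closed before the outer comment ends.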
#[test]
fn tokenize_nested_multiline_comment() {
let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
)),
Token::Number("1".to_string(), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_multiline_comment_with_even_asterisks() {
let sql = String::from("\n/** Comment **/\n");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_unicode_whitespace() {
let sql = String::from(" \u{2003}\n");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_mismatched_quotes() {
let sql = String::from("\"foo");
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
assert_eq!(
tokenizer.tokenize(),
Err(TokenizerError {
message: "Expected close delimiter '\"' before EOF.".to_string(),
location: Location { line: 1, column: 1 },
})
);
}
#[test]
fn tokenize_newlines() {
let sql = String::from("line1\nline2\rline3\r\nline4\r");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_word("line1", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line2", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line3", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line4", None),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_mssql_top() {
let sql = "SELECT TOP 5 [bar] FROM foo";
let dialect = MsSqlDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("TOP"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5"), false),
Token::Whitespace(Whitespace::Space),
Token::make_word("bar", Some('[')),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("foo", None),
];
compare(expected, tokens);
}
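// PostgreSQL regex-match operators: ~ (match), ~* (case-insensitive match), !~ and !~* (their negations).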
#[test]
fn tokenize_pg_regex_match() {
let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::Tilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::TildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkTilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkTildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
];
compare(expected, tokens);
}
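// PostgreSQL operator spellings of LIKE: ~~ is LIKE, ~~* is ILIKE, !~~ and !~~* are NOT LIKE / NOT ILIKE.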
#[test]
fn tokenize_pg_like_match() {
let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::DoubleTilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::DoubleTildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkDoubleTilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkDoubleTildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_quoted_identifier() {
let sql = r#" "a "" b" "a """ "c """"" "#;
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a " b"#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a ""#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"c """#, Some('"')),
Token::Whitespace(Whitespace::Space),
];
compare(expected, tokens);
}
#[test]
fn tokenize_snowflake_div() {
let sql = r#"field/1000"#;
let dialect = SnowflakeDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_word(r#"field"#, None),
Token::Div,
Token::Number("1000".to_string(), false),
];
compare(expected, tokens);
}
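// Same input as tokenize_quoted_identifier, but with_unescape(false) keeps the doubled quotes verbatim.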
#[test]
fn tokenize_quoted_identifier_with_no_escape() {
let sql = r#" "a "" b" "a """ "c """"" "#;
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a "" b"#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a """#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"c """""#, Some('"')),
Token::Whitespace(Whitespace::Space),
];
compare(expected, tokens);
}
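// tokenize_with_location attaches the 1-based line and column at which each token starts.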
#[test]
fn tokenize_with_location() {
let sql = "SELECT a,\n b";
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql)
.tokenize_with_location()
.unwrap();
let expected = vec![
TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
TokenWithLocation::new(Token::make_word("a", None), 1, 8),
TokenWithLocation::new(Token::Comma, 1, 9),
TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
TokenWithLocation::new(Token::make_word("b", None), 2, 2),
];
compare(expected, tokens);
}
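// Asserts that the tokenizer output matches the expected token stream.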
fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
//println!("------------------------------");
//println!("tokens = {:?}", actual);
//println!("expected = {:?}", expected);
//println!("------------------------------");
assert_eq!(expected, actual);
}
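// Wraps `s` in single quotes and unescapes it; `expected` of None means the escape sequence is rejected.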
fn check_unescape(s: &str, expected: Option<&str>) {
let s = format!("'{}'", s);
let mut state = State {
peekable: s.chars().peekable(),
line: 0,
col: 0,
};
assert_eq!(
unescape_single_quoted_string(&mut state),
expected.map(|s| s.to_string())
);
}
#[test]
fn test_unescape() {
check_unescape(r"\b", Some("\u{0008}"));
check_unescape(r"\f", Some("\u{000C}"));
check_unescape(r"\t", Some("\t"));
check_unescape(r"\r\n", Some("\r\n"));
check_unescape(r"\/", Some("/"));
check_unescape(r"/", Some("/"));
check_unescape(r"\\", Some("\\"));
// Unicode escapes: \u + 4 hex digits (16-bit) and \U + 8 hex digits (32-bit);
// NUL and out-of-range code points are rejected
check_unescape(r"\u0001", Some("\u{0001}"));
check_unescape(r"\u4c91", Some("\u{4c91}"));
check_unescape(r"\u4c916", Some("\u{4c91}6"));
check_unescape(r"\u4c", None);
check_unescape(r"\u0000", None);
check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
check_unescape(r"\U00110000", None);
check_unescape(r"\U00000000", None);
check_unescape(r"\u", None);
check_unescape(r"\U", None);
check_unescape(r"\U1010FFFF", None);
// hexadecimal byte value: \x + 1-2 hex digits (with no digits, the backslash is dropped);
// NUL and non-ASCII results are rejected
check_unescape(r"\x4B", Some("\u{004b}"));
check_unescape(r"\x4", Some("\u{0004}"));
check_unescape(r"\x4L", Some("\u{0004}L"));
check_unescape(r"\x", Some("x"));
check_unescape(r"\xP", Some("xP"));
check_unescape(r"\x0", None);
check_unescape(r"\xCAD", None);
check_unescape(r"\xA9", None);
// octal byte value: \ + 1-3 octal digits; the value wraps to a byte,
// and NUL or non-ASCII results are rejected
check_unescape(r"\1", Some("\u{0001}"));
check_unescape(r"\12", Some("\u{000a}"));
check_unescape(r"\123", Some("\u{0053}"));
check_unescape(r"\1232", Some("\u{0053}2"));
check_unescape(r"\4", Some("\u{0004}"));
check_unescape(r"\45", Some("\u{0025}"));
check_unescape(r"\450", Some("\u{0028}"));
check_unescape(r"\603", None);
check_unescape(r"\0", None);
check_unescape(r"\080", None);
// others
check_unescape(r"\9", Some("9"));
check_unescape(r"''", Some("'"));
check_unescape(
r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
);
check_unescape(r"Hello\0", None);
check_unescape(r"Hello\xCADRust", None);
}
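// Dialects that support numeric prefixes (e.g. Hive, MySQL) allow identifiers to
// start with a digit; a bare integer must still tokenize as a number.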
#[test]
fn tokenize_numeric_prefix_trait() {
#[derive(Debug)]
struct NumericPrefixDialect;
impl Dialect for NumericPrefixDialect {
fn is_identifier_start(&self, ch: char) -> bool {
ch.is_ascii_lowercase()
|| ch.is_ascii_uppercase()
|| ch.is_ascii_digit()
|| ch == '$'
}
fn is_identifier_part(&self, ch: char) -> bool {
ch.is_ascii_lowercase()
|| ch.is_ascii_uppercase()
|| ch.is_ascii_digit()
|| ch == '_'
|| ch == '$'
|| ch == '{'
|| ch == '}'
}
fn supports_numeric_prefix(&self) -> bool {
true
}
}
tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
tokenize_numeric_prefix_inner(&HiveDialect {});
tokenize_numeric_prefix_inner(&MySqlDialect {});
}
fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
let sql = r#"SELECT * FROM 1"#;
let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
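// Snowflake supports backslash escapes in single-quoted strings; with_unescape
// controls whether they are resolved or kept verbatim.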
#[test]
fn tokenize_quoted_string_escape() {
let dialect = SnowflakeDialect {};
for (sql, expected, expected_unescaped) in [
(r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
(r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
(r#"'\\'"#, r#"\\"#, r#"\"#),
(
r#"'\0\a\b\f\n\r\t\Z'"#,
r#"\0\a\b\f\n\r\t\Z"#,
"\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
),
(r#"'\"'"#, r#"\""#, "\""),
(r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
] {
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
compare(expected, tokens);
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
compare(expected, tokens);
}
for sql in [r#"'\'"#, r#"'ab\'"#] {
let mut tokenizer = Tokenizer::new(&dialect, sql);
assert_eq!(
"Unterminated string literal",
tokenizer.tokenize().unwrap_err().message.as_str(),
);
}
// Non-escaping dialect: the backslash is kept verbatim
for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
compare(expected, tokens);
}
}
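// BigQuery triple-quoted strings: check() exercises both quote characters, with and without unescaping.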
#[test]
fn tokenize_triple_quoted_string() {
fn check<F>(
q: char, // The quote character to test
r: char, // An alternate quote character
quote_token: F,
) where
F: Fn(String) -> Token,
{
let dialect = BigQueryDialect {};
for (sql, expected, expected_unescaped) in [
// Empty string
(format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
// Should not count escaped quote as end of string.
(
format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
format!(r#"ab{q}{q}\{q}{q}cd"#),
format!(r#"ab{q}{q}{q}{q}cd"#),
),
// Simple string
(
format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
"abc".into(),
"abc".into(),
),
// Mixed single and double quotes, unescaped.
(
format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
),
// Escaped quote.
(
format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
),
// Backslash-escaped quote characters.
(
format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
r#"a\'\'b\'c\'d"#.into(),
r#"a''b'c'd"#.into(),
),
// Backslash-escaped characters.
(
format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
r#"abc\0\n\rdef"#.into(),
"abc\0\n\rdef".into(),
),
] {
let tokens = Tokenizer::new(&dialect, sql.as_str())
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![quote_token(expected.to_string())];
compare(expected, tokens);
let tokens = Tokenizer::new(&dialect, sql.as_str())
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![quote_token(expected_unescaped.to_string())];
compare(expected, tokens);
}
for sql in [
format!(r#"{q}{q}{q}{q}{q}\{q}"#),
format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
format!(r#"{q}{q}{q}{q}"#),
format!(r#"{q}{q}{q}{r}{r}"#),
format!(r#"{q}{q}{q}abc{q}"#),
format!(r#"{q}{q}{q}abc{q}{q}"#),
format!(r#"{q}{q}{q}abc"#),
] {
let dialect = BigQueryDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
assert_eq!(
"Unterminated string literal",
tokenizer.tokenize().unwrap_err().message.as_str(),
);
}
}
check('"', '\'', Token::TripleDoubleQuotedString);
check('\'', '"', Token::TripleSingleQuotedString);
let dialect = BigQueryDialect {};
let sql = r#"""''"#;
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![
Token::DoubleQuotedString("".to_string()),
Token::SingleQuotedString("".to_string()),
];
compare(expected, tokens);
let sql = r#"''"""#;
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![
Token::SingleQuotedString("".to_string()),
Token::DoubleQuotedString("".to_string()),
];
compare(expected, tokens);
// A dialect without triple-quoted strings reads '''''' as one string containing two escaped quotes
let dialect = SnowflakeDialect {};
let sql = r#"''''''"#;
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString("''".to_string())];
compare(expected, tokens);
}
}