| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| //! SQL Tokenizer |
| //! |
| //! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens. |
| //! |
| //! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST). |
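| //! |
| //! A minimal usage sketch (mirroring the [`Tokenizer::new`] example below): |
| //! |
| //! ``` |
| //! # use sqlparser::dialect::GenericDialect; |
| //! # use sqlparser::tokenizer::Tokenizer; |
| //! let dialect = GenericDialect {}; |
| //! let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap(); |
| //! assert_eq!(tokens.len(), 3); // `SELECT`, whitespace, `1` |
| //! ``` |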
| |
| #[cfg(not(feature = "std"))] |
| use alloc::{ |
| borrow::ToOwned, |
| format, |
| string::{String, ToString}, |
| vec, |
| vec::Vec, |
| }; |
| use core::iter::Peekable; |
| use core::num::NonZeroU8; |
| use core::str::Chars; |
| use core::{cmp, fmt}; |
| |
| #[cfg(feature = "serde")] |
| use serde::{Deserialize, Serialize}; |
| |
| #[cfg(feature = "visitor")] |
| use sqlparser_derive::{Visit, VisitMut}; |
| |
| use crate::dialect::Dialect; |
| use crate::dialect::{ |
| BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect, |
| SnowflakeDialect, |
| }; |
| use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX}; |
| use crate::{ast::DollarQuotedString, dialect::HiveDialect}; |
| |
| /// SQL Token enumeration |
| #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] |
| #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] |
| #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] |
| pub enum Token { |
| /// An end-of-file marker, not a real token |
| EOF, |
| /// A keyword (like SELECT) or an optionally quoted SQL identifier |
| Word(Word), |
| /// An unsigned numeric literal; the `bool` flag is true when the literal |
| /// has a trailing `L` (long) suffix |
| Number(String, bool), |
| /// A character that could not be tokenized |
| Char(char), |
| /// Single quoted string: e.g. 'string' |
| SingleQuotedString(String), |
| /// Double quoted string: e.g. "string" |
| DoubleQuotedString(String), |
| /// Triple single quoted strings: Example '''abc''' |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| TripleSingleQuotedString(String), |
| /// Triple double quoted strings: Example """abc""" |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| TripleDoubleQuotedString(String), |
| /// Dollar quoted string: e.g. $$string$$ or $tag_name$string$tag_name$ |
| DollarQuotedString(DollarQuotedString), |
| /// Byte string literal: e.g. b'string' or B'string' (note that some backends, such as |
| /// PostgreSQL, may treat this syntax as a bit string literal instead, e.g. b'10010101') |
| SingleQuotedByteStringLiteral(String), |
| /// Byte string literal: e.g. b"string" or B"string" |
| DoubleQuotedByteStringLiteral(String), |
| /// Triple single quoted literal with byte string prefix. Example `B'''abc'''` |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| TripleSingleQuotedByteStringLiteral(String), |
| /// Triple double quoted literal with byte string prefix. Example `B"""abc"""` |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| TripleDoubleQuotedByteStringLiteral(String), |
| /// Single quoted literal with raw string prefix. Example `R'abc'` |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| SingleQuotedRawStringLiteral(String), |
| /// Double quoted literal with raw string prefix. Example `R"abc"` |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| DoubleQuotedRawStringLiteral(String), |
| /// Triple single quoted literal with raw string prefix. Example `R'''abc'''` |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| TripleSingleQuotedRawStringLiteral(String), |
| /// Triple double quoted literal with raw string prefix. Example `R"""abc"""` |
| /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals) |
| TripleDoubleQuotedRawStringLiteral(String), |
| /// "National" string literal: i.e: N'string' |
| NationalStringLiteral(String), |
| /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second' |
| EscapedStringLiteral(String), |
| /// Unicode string literal: i.e: U&'first \000A second' |
| UnicodeStringLiteral(String), |
| /// Hexadecimal string literal: i.e.: X'deadbeef' |
| HexStringLiteral(String), |
| /// Comma |
| Comma, |
| /// Whitespace (space, tab, etc) |
| Whitespace(Whitespace), |
| /// Double equals sign `==` |
| DoubleEq, |
| /// Equality operator `=` |
| Eq, |
| /// Not Equals operator `<>` (or `!=` in some dialects) |
| Neq, |
| /// Less Than operator `<` |
| Lt, |
| /// Greater Than operator `>` |
| Gt, |
| /// Less Than Or Equals operator `<=` |
| LtEq, |
| /// Greater Than Or Equals operator `>=` |
| GtEq, |
| /// Spaceship operator <=> |
| Spaceship, |
| /// Plus operator `+` |
| Plus, |
| /// Minus operator `-` |
| Minus, |
| /// Multiplication operator `*` |
| Mul, |
| /// Division operator `/` |
| Div, |
| /// Integer division operator `//` in DuckDB |
| DuckIntDiv, |
| /// Modulo Operator `%` |
| Mod, |
| /// String concatenation `||` |
| StringConcat, |
| /// Left parenthesis `(` |
| LParen, |
| /// Right parenthesis `)` |
| RParen, |
| /// Period (used for compound identifiers or projections into nested types) |
| Period, |
| /// Colon `:` |
| Colon, |
| /// DoubleColon `::` (used for casting in PostgreSQL) |
| DoubleColon, |
| /// Assignment `:=` (used for keyword arguments in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake) |
| Assignment, |
| /// SemiColon `;` used as a separator, e.g. between a COPY statement and its payload |
| SemiColon, |
| /// Backslash `\` used to terminate the COPY payload with `\.` |
| Backslash, |
| /// Left bracket `[` |
| LBracket, |
| /// Right bracket `]` |
| RBracket, |
| /// Ampersand `&` |
| Ampersand, |
| /// Pipe `|` |
| Pipe, |
| /// Caret `^` |
| Caret, |
| /// Left brace `{` |
| LBrace, |
| /// Right brace `}` |
| RBrace, |
| /// Right Arrow `=>` |
| RArrow, |
| /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection) |
| Sharp, |
| /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity) |
| DoubleSharp, |
| /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator |
| Tilde, |
| /// `~*` , a case insensitive match regular expression operator in PostgreSQL |
| TildeAsterisk, |
| /// `!~` , a case sensitive not match regular expression operator in PostgreSQL |
| ExclamationMarkTilde, |
| /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL |
| ExclamationMarkTildeAsterisk, |
| /// `~~`, a case sensitive match pattern operator in PostgreSQL |
| DoubleTilde, |
| /// `~~*`, a case insensitive match pattern operator in PostgreSQL |
| DoubleTildeAsterisk, |
| /// `!~~`, a case sensitive not match pattern operator in PostgreSQL |
| ExclamationMarkDoubleTilde, |
| /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL |
| ExclamationMarkDoubleTildeAsterisk, |
| /// `<<`, a bitwise shift left operator in PostgreSQL |
| ShiftLeft, |
| /// `>>`, a bitwise shift right operator in PostgreSQL |
| ShiftRight, |
| /// `&&`, an overlap operator in PostgreSQL |
| Overlap, |
| /// Exclamation Mark `!` used for PostgreSQL factorial operator |
| ExclamationMark, |
| /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator |
| DoubleExclamationMark, |
| /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on) |
| AtSign, |
| /// `^@`, a "starts with" string operator in PostgreSQL |
| CaretAt, |
| /// `|/`, a square root math operator in PostgreSQL |
| PGSquareRoot, |
| /// `||/`, a cube root math operator in PostgreSQL |
| PGCubeRoot, |
| /// `?` or `$`, a prepared statement argument placeholder |
| Placeholder(String), |
| /// `->`, used as an operator to extract a JSON field in PostgreSQL |
| Arrow, |
| /// `->>`, used as an operator to extract a JSON field as text in PostgreSQL |
| LongArrow, |
| /// `#>`, extracts JSON sub-object at the specified path |
| HashArrow, |
| /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference) |
| AtDashAt, |
| /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?) |
| QuestionMarkDash, |
| /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?) |
| AmpersandLeftAngleBracket, |
| /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?) |
| AmpersandRightAngleBracket, |
| /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?) |
| AmpersandLeftAngleBracketVerticalBar, |
| /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?) |
| VerticalBarAmpersandRightAngleBracket, |
| /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between) |
| TwoWayArrow, |
| /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?) |
| LeftAngleBracketCaret, |
| /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?) |
| RightAngleBracketCaret, |
| /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps) |
| QuestionMarkSharp, |
| /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?) |
| QuestionMarkDashVerticalBar, |
| /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?) |
| QuestionMarkDoubleVerticalBar, |
| /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as) |
| TildeEqual, |
| /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?) |
| ShiftLeftVerticalBar, |
| /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?) |
| VerticalBarShiftRight, |
| /// `|>` BigQuery pipe operator |
| VerticalBarRightAngleBracket, |
| /// `#>>`, extracts JSON sub-object at the specified path as text |
| HashLongArrow, |
| /// jsonb @> jsonb -> boolean: Test whether left json contains the right json |
| AtArrow, |
| /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json |
| ArrowAt, |
| /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified |
| /// path, where path elements can be either field keys or array indexes. |
| HashMinus, |
| /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified |
| /// JSON value? |
| AtQuestion, |
| /// jsonb @@ jsonpath -> boolean: Returns the result of a JSON path predicate check |
| /// for the specified JSON value. Only the first item of the result is taken into |
| /// account. If the result is not Boolean, then NULL is returned. |
| AtAt, |
| /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the |
| /// jsonb object |
| Question, |
| /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level |
| /// keys within the jsonb object |
| QuestionAnd, |
| /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level |
| /// keys within the jsonb object |
| QuestionPipe, |
| /// Custom binary operator |
| /// This is used to represent any custom binary operator that is not part of the SQL standard. |
| /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR. |
| CustomBinaryOperator(String), |
| } |
| |
| impl fmt::Display for Token { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match self { |
| Token::EOF => f.write_str("EOF"), |
| Token::Word(ref w) => write!(f, "{w}"), |
| Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }), |
| Token::Char(ref c) => write!(f, "{c}"), |
| Token::SingleQuotedString(ref s) => write!(f, "'{s}'"), |
| Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"), |
| Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""), |
| Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""), |
| Token::DollarQuotedString(ref s) => write!(f, "{s}"), |
| Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"), |
| Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"), |
| Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"), |
| Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"), |
| Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"), |
| Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"), |
| Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""), |
| Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""), |
| Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"), |
| Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""), |
| Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"), |
| Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""), |
| Token::Comma => f.write_str(","), |
| Token::Whitespace(ws) => write!(f, "{ws}"), |
| Token::DoubleEq => f.write_str("=="), |
| Token::Spaceship => f.write_str("<=>"), |
| Token::Eq => f.write_str("="), |
| Token::Neq => f.write_str("<>"), |
| Token::Lt => f.write_str("<"), |
| Token::Gt => f.write_str(">"), |
| Token::LtEq => f.write_str("<="), |
| Token::GtEq => f.write_str(">="), |
| Token::Plus => f.write_str("+"), |
| Token::Minus => f.write_str("-"), |
| Token::Mul => f.write_str("*"), |
| Token::Div => f.write_str("/"), |
| Token::DuckIntDiv => f.write_str("//"), |
| Token::StringConcat => f.write_str("||"), |
| Token::Mod => f.write_str("%"), |
| Token::LParen => f.write_str("("), |
| Token::RParen => f.write_str(")"), |
| Token::Period => f.write_str("."), |
| Token::Colon => f.write_str(":"), |
| Token::DoubleColon => f.write_str("::"), |
| Token::Assignment => f.write_str(":="), |
| Token::SemiColon => f.write_str(";"), |
| Token::Backslash => f.write_str("\\"), |
| Token::LBracket => f.write_str("["), |
| Token::RBracket => f.write_str("]"), |
| Token::Ampersand => f.write_str("&"), |
| Token::Caret => f.write_str("^"), |
| Token::Pipe => f.write_str("|"), |
| Token::LBrace => f.write_str("{"), |
| Token::RBrace => f.write_str("}"), |
| Token::RArrow => f.write_str("=>"), |
| Token::Sharp => f.write_str("#"), |
| Token::DoubleSharp => f.write_str("##"), |
| Token::ExclamationMark => f.write_str("!"), |
| Token::DoubleExclamationMark => f.write_str("!!"), |
| Token::Tilde => f.write_str("~"), |
| Token::TildeAsterisk => f.write_str("~*"), |
| Token::ExclamationMarkTilde => f.write_str("!~"), |
| Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"), |
| Token::DoubleTilde => f.write_str("~~"), |
| Token::DoubleTildeAsterisk => f.write_str("~~*"), |
| Token::ExclamationMarkDoubleTilde => f.write_str("!~~"), |
| Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"), |
| Token::AtSign => f.write_str("@"), |
| Token::CaretAt => f.write_str("^@"), |
| Token::ShiftLeft => f.write_str("<<"), |
| Token::ShiftRight => f.write_str(">>"), |
| Token::Overlap => f.write_str("&&"), |
| Token::PGSquareRoot => f.write_str("|/"), |
| Token::PGCubeRoot => f.write_str("||/"), |
| Token::AtDashAt => f.write_str("@-@"), |
| Token::QuestionMarkDash => f.write_str("?-"), |
| Token::AmpersandLeftAngleBracket => f.write_str("&<"), |
| Token::AmpersandRightAngleBracket => f.write_str("&>"), |
| Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"), |
| Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"), |
| Token::VerticalBarRightAngleBracket => f.write_str("|>"), |
| Token::TwoWayArrow => f.write_str("<->"), |
| Token::LeftAngleBracketCaret => f.write_str("<^"), |
| Token::RightAngleBracketCaret => f.write_str(">^"), |
| Token::QuestionMarkSharp => f.write_str("?#"), |
| Token::QuestionMarkDashVerticalBar => f.write_str("?-|"), |
| Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"), |
| Token::TildeEqual => f.write_str("~="), |
| Token::ShiftLeftVerticalBar => f.write_str("<<|"), |
| Token::VerticalBarShiftRight => f.write_str("|>>"), |
| Token::Placeholder(ref s) => write!(f, "{s}"), |
| Token::Arrow => write!(f, "->"), |
| Token::LongArrow => write!(f, "->>"), |
| Token::HashArrow => write!(f, "#>"), |
| Token::HashLongArrow => write!(f, "#>>"), |
| Token::AtArrow => write!(f, "@>"), |
| Token::ArrowAt => write!(f, "<@"), |
| Token::HashMinus => write!(f, "#-"), |
| Token::AtQuestion => write!(f, "@?"), |
| Token::AtAt => write!(f, "@@"), |
| Token::Question => write!(f, "?"), |
| Token::QuestionAnd => write!(f, "?&"), |
| Token::QuestionPipe => write!(f, "?|"), |
| Token::CustomBinaryOperator(s) => f.write_str(s), |
| } |
| } |
| } |
| |
| impl Token { |
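| /// Create a keyword token (e.g. for `SELECT`) from an unquoted word, |
| /// delegating to [`Token::make_word`] with no quote style. |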
| pub fn make_keyword(keyword: &str) -> Self { |
| Token::make_word(keyword, None) |
| } |
| |
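| /// Create a `Token::Word` from a word and an optional quote style. |
| /// Keyword detection only applies to unquoted words; a minimal sketch: |
| /// |
| /// ``` |
| /// # use sqlparser::keywords::Keyword; |
| /// # use sqlparser::tokenizer::Token; |
| /// if let Token::Word(w) = Token::make_word("select", None) { |
| ///     assert_eq!(w.keyword, Keyword::SELECT); // unquoted => keyword |
| /// } |
| /// if let Token::Word(w) = Token::make_word("select", Some('"')) { |
| ///     assert_eq!(w.keyword, Keyword::NoKeyword); // quoted => plain identifier |
| /// } |
| /// ``` |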
| pub fn make_word(word: &str, quote_style: Option<char>) -> Self { |
| let word_uppercase = word.to_uppercase(); |
| Token::Word(Word { |
| value: word.to_string(), |
| quote_style, |
| keyword: if quote_style.is_none() { |
| let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str()); |
| keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x]) |
| } else { |
| Keyword::NoKeyword |
| }, |
| }) |
| } |
| } |
| |
| /// A keyword (like SELECT) or an optionally quoted SQL identifier |
| #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] |
| #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] |
| #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] |
| pub struct Word { |
| /// The value of the token, without the enclosing quotes, and with the |
| /// escape sequences (if any) processed (TODO: escapes are not handled) |
| pub value: String, |
| /// An identifier can be "quoted" (`<delimited identifier>` in ANSI parlance). |
| /// The standard and most implementations allow using double quotes for this, |
| /// but some implementations support other quoting styles as well (e.g. \[MS SQL]) |
| pub quote_style: Option<char>, |
| /// If the word was not quoted and it matched one of the known keywords, |
| /// this will have one of the values from `crate::keywords`, otherwise |
| /// `Keyword::NoKeyword` |
| pub keyword: Keyword, |
| } |
| |
| impl fmt::Display for Word { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match self.quote_style { |
| Some(s) if s == '"' || s == '[' || s == '`' => { |
| write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s)) |
| } |
| None => f.write_str(&self.value), |
| _ => panic!("Unexpected quote_style!"), |
| } |
| } |
| } |
| |
| impl Word { |
| fn matching_end_quote(ch: char) -> char { |
| match ch { |
| '"' => '"', // ANSI and most dialects |
| '[' => ']', // MS SQL |
| '`' => '`', // MySQL |
| _ => panic!("unexpected quoting style!"), |
| } |
| } |
| } |
| |
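| /// A unit of whitespace, or a comment, emitted as [`Token::Whitespace`] |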
| #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] |
| #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] |
| #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] |
| pub enum Whitespace { |
| Space, |
| Newline, |
| Tab, |
| SingleLineComment { comment: String, prefix: String }, |
| MultiLineComment(String), |
| } |
| |
| impl fmt::Display for Whitespace { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match self { |
| Whitespace::Space => f.write_str(" "), |
| Whitespace::Newline => f.write_str("\n"), |
| Whitespace::Tab => f.write_str("\t"), |
| Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"), |
| Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"), |
| } |
| } |
| } |
| |
| /// Location in input string |
| /// |
| /// # Create an "empty" (unknown) `Location` |
| /// ``` |
| /// # use sqlparser::tokenizer::Location; |
| /// let location = Location::empty(); |
| /// ``` |
| /// |
| /// # Create a `Location` from a line and column |
| /// ``` |
| /// # use sqlparser::tokenizer::Location; |
| /// let location = Location::new(1, 1); |
| /// ``` |
| /// |
| /// # Create a `Location` from a pair |
| /// ``` |
| /// # use sqlparser::tokenizer::Location; |
| /// let location = Location::from((1, 1)); |
| /// ``` |
| #[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)] |
| #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] |
| #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] |
| pub struct Location { |
| /// Line number, starting from 1. |
| /// |
| /// Note: Line 0 is used for empty spans |
| pub line: u64, |
| /// Line column, starting from 1. |
| /// |
| /// Note: Column 0 is used for empty spans |
| pub column: u64, |
| } |
| |
| impl fmt::Display for Location { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| if self.line == 0 { |
| return Ok(()); |
| } |
| write!(f, " at Line: {}, Column: {}", self.line, self.column) |
| } |
| } |
| |
| impl fmt::Debug for Location { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "Location({},{})", self.line, self.column) |
| } |
| } |
| |
| impl Location { |
| /// Return an "empty" / unknown location |
| pub fn empty() -> Self { |
| Self { line: 0, column: 0 } |
| } |
| |
| /// Create a new `Location` for a given line and column |
| pub fn new(line: u64, column: u64) -> Self { |
| Self { line, column } |
| } |
| |
| /// Create a new location for a given line and column |
| /// |
| /// Alias for [`Self::new`] |
| // TODO: remove / deprecate in favor of `new` for consistency? |
| pub fn of(line: u64, column: u64) -> Self { |
| Self::new(line, column) |
| } |
| |
| /// Combine self and `end` into a new `Span` |
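| /// |
| /// ``` |
| /// # use sqlparser::tokenizer::{Location, Span}; |
| /// let span = Location::new(1, 1).span_to(Location::new(1, 5)); |
| /// assert_eq!(span, Span::new(Location::new(1, 1), Location::new(1, 5))); |
| /// ``` |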
| pub fn span_to(self, end: Self) -> Span { |
| Span { start: self, end } |
| } |
| } |
| |
| impl From<(u64, u64)> for Location { |
| fn from((line, column): (u64, u64)) -> Self { |
| Self { line, column } |
| } |
| } |
| |
| /// A span represents a linear portion of the input string (start, end) |
| /// |
| /// See [Spanned](crate::ast::Spanned) for more information. |
| #[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)] |
| #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] |
| #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] |
| pub struct Span { |
| pub start: Location, |
| pub end: Location, |
| } |
| |
| impl fmt::Debug for Span { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "Span({:?}..{:?})", self.start, self.end) |
| } |
| } |
| |
| impl Span { |
| // An empty span (0, 0) -> (0, 0) |
| // We need a const instance for pattern matching |
| const EMPTY: Span = Self::empty(); |
| |
| /// Create a new span from a start and end [`Location`] |
| pub fn new(start: Location, end: Location) -> Span { |
| Span { start, end } |
| } |
| |
| /// Returns an empty span `(0, 0) -> (0, 0)` |
| /// |
| /// Empty spans represent no knowledge of source location |
| /// See [Spanned](crate::ast::Spanned) for more information. |
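| /// |
| /// A small illustration of how empty spans behave under [Span::union]: |
| /// ``` |
| /// # use sqlparser::tokenizer::{Span, Location}; |
| /// let span = Span::new(Location::new(1, 1), Location::new(1, 5)); |
| /// // unioning with an empty span returns the other span unchanged |
| /// assert_eq!(Span::empty().union(&span), span); |
| /// ``` |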
| pub const fn empty() -> Span { |
| Span { |
| start: Location { line: 0, column: 0 }, |
| end: Location { line: 0, column: 0 }, |
| } |
| } |
| |
| /// Returns the smallest Span that contains both `self` and `other` |
| /// If either span is [Span::empty], the other span is returned |
| /// |
| /// # Examples |
| /// ``` |
| /// # use sqlparser::tokenizer::{Span, Location}; |
| /// // line 1, column 1 -> line 2, column 5 |
| /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5)); |
| /// // line 2, column 3 -> line 3, column 7 |
| /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7)); |
| /// // Union of the two is the min/max of the two spans |
| /// // line 1, column 1 -> line 3, column 7 |
| /// let union = span1.union(&span2); |
| /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7))); |
| /// ``` |
| pub fn union(&self, other: &Span) -> Span { |
| // If either span is empty, return the other |
| // this prevents propagating (0, 0) through the tree |
| match (self, other) { |
| (&Span::EMPTY, _) => *other, |
| (_, &Span::EMPTY) => *self, |
| _ => Span { |
| start: cmp::min(self.start, other.start), |
| end: cmp::max(self.end, other.end), |
| }, |
| } |
| } |
| |
| /// Same as [Span::union] for `Option<Span>` |
| /// |
| /// If `other` is `None`, `self` is returned |
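| /// |
| /// # Example |
| /// ``` |
| /// # use sqlparser::tokenizer::{Span, Location}; |
| /// let span = Span::new(Location::new(1, 1), Location::new(1, 5)); |
| /// let other = Span::new(Location::new(1, 3), Location::new(2, 7)); |
| /// assert_eq!(span.union_opt(&None), span); |
| /// assert_eq!(span.union_opt(&Some(other)), span.union(&other)); |
| /// ``` |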
| pub fn union_opt(&self, other: &Option<Span>) -> Span { |
| match other { |
| Some(other) => self.union(other), |
| None => *self, |
| } |
| } |
| |
| /// Return the [Span::union] of all spans in the iterator |
| /// |
| /// If the iterator is empty, an empty span is returned |
| /// |
| /// # Example |
| /// ``` |
| /// # use sqlparser::tokenizer::{Span, Location}; |
| /// let spans = vec![ |
| /// Span::new(Location::new(1, 1), Location::new(2, 5)), |
| /// Span::new(Location::new(2, 3), Location::new(3, 7)), |
| /// Span::new(Location::new(3, 1), Location::new(4, 2)), |
| /// ]; |
| /// // line 1, column 1 -> line 4, column 2 |
| /// assert_eq!( |
| /// Span::union_iter(spans), |
| /// Span::new(Location::new(1, 1), Location::new(4, 2)) |
| /// ); |
| /// ``` |
| pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span { |
| iter.into_iter() |
| .reduce(|acc, item| acc.union(&item)) |
| .unwrap_or(Span::empty()) |
| } |
| } |
| |
| /// Backwards compatibility alias for [`TokenWithSpan`] |
| #[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")] |
| pub type TokenWithLocation = TokenWithSpan; |
| |
| /// A [Token] with [Span] attached to it |
| /// |
| /// This is used to track the location of a token in the input string |
| /// |
| /// # Examples |
| /// ``` |
| /// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan}; |
| /// // comma @ line 1, column 10 |
| /// let tok1 = TokenWithSpan::new( |
| /// Token::Comma, |
| /// Span::new(Location::new(1, 10), Location::new(1, 11)), |
| /// ); |
| /// assert_eq!(tok1, Token::Comma); // can compare the token |
| /// |
| /// // comma @ line 2, column 20 |
| /// let tok2 = TokenWithSpan::new( |
| /// Token::Comma, |
| /// Span::new(Location::new(2, 20), Location::new(2, 21)), |
| /// ); |
| /// // same token but different locations are not equal |
| /// assert_ne!(tok1, tok2); |
| /// ``` |
| #[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)] |
| #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] |
| #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] |
| pub struct TokenWithSpan { |
| pub token: Token, |
| pub span: Span, |
| } |
| |
| impl TokenWithSpan { |
| /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`] |
| pub fn new(token: Token, span: Span) -> Self { |
| Self { token, span } |
| } |
| |
| /// Wrap a token with an empty span |
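| /// |
| /// ``` |
| /// # use sqlparser::tokenizer::{Span, Token, TokenWithSpan}; |
| /// let tok = TokenWithSpan::wrap(Token::Comma); |
| /// assert_eq!(tok.span, Span::empty()); |
| /// ``` |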
| pub fn wrap(token: Token) -> Self { |
| Self::new(token, Span::empty()) |
| } |
| |
| /// Wrap a token with a location from `start` to `end` |
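| /// |
| /// ``` |
| /// # use sqlparser::tokenizer::{Location, Token, TokenWithSpan}; |
| /// let tok = TokenWithSpan::at(Token::Comma, Location::new(1, 1), Location::new(1, 2)); |
| /// assert_eq!(tok.span.end, Location::new(1, 2)); |
| /// ``` |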
| pub fn at(token: Token, start: Location, end: Location) -> Self { |
| Self::new(token, Span::new(start, end)) |
| } |
| |
| /// Return an EOF token with no location |
| pub fn new_eof() -> Self { |
| Self::wrap(Token::EOF) |
| } |
| } |
| |
| impl PartialEq<Token> for TokenWithSpan { |
| fn eq(&self, other: &Token) -> bool { |
| &self.token == other |
| } |
| } |
| |
| impl PartialEq<TokenWithSpan> for Token { |
| fn eq(&self, other: &TokenWithSpan) -> bool { |
| self == &other.token |
| } |
| } |
| |
| impl fmt::Display for TokenWithSpan { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| self.token.fmt(f) |
| } |
| } |
| |
| /// Tokenizer error |
| #[derive(Debug, PartialEq, Eq)] |
| pub struct TokenizerError { |
| pub message: String, |
| pub location: Location, |
| } |
| |
| impl fmt::Display for TokenizerError { |
| fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| write!(f, "{}{}", self.message, self.location) |
| } |
| } |
| |
| #[cfg(feature = "std")] |
| impl std::error::Error for TokenizerError {} |
| |
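| /// Mutable tokenizer state: a peekable stream over the input characters |
| /// that tracks the current line and column so tokens and errors can be |
| /// tagged with a [`Location`]. |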
| struct State<'a> { |
| peekable: Peekable<Chars<'a>>, |
| pub line: u64, |
| pub col: u64, |
| } |
| |
| impl State<'_> { |
| /// return the next character and advance the stream |
| pub fn next(&mut self) -> Option<char> { |
| match self.peekable.next() { |
| None => None, |
| Some(s) => { |
| if s == '\n' { |
| self.line += 1; |
| self.col = 1; |
| } else { |
| self.col += 1; |
| } |
| Some(s) |
| } |
| } |
| } |
| |
| /// return the next character but do not advance the stream |
| pub fn peek(&mut self) -> Option<&char> { |
| self.peekable.peek() |
| } |
| |
| pub fn location(&self) -> Location { |
| Location { |
| line: self.line, |
| column: self.col, |
| } |
| } |
| } |
| |
| /// Represents how many quote characters enclose a string literal. |
| #[derive(Copy, Clone)] |
| enum NumStringQuoteChars { |
| /// e.g. `"abc"`, `'abc'`, `r'abc'` |
| One, |
| /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''` |
| Many(NonZeroU8), |
| } |
| |
| /// Settings for tokenizing a quoted string literal. |
| struct TokenizeQuotedStringSettings { |
| /// The character used to quote the string. |
| quote_style: char, |
| /// Represents how many quote characters enclose the string literal. |
| num_quote_chars: NumStringQuoteChars, |
| /// The number of opening quotes left to consume, before parsing |
| /// the remaining string literal. |
| /// For example: given the initial string `"""abc"""`, if the caller has |
| /// already consumed the first quote for some reason, then this value is |
| /// set to 2, flagging that only the 2 remaining leading quotes need to |
| /// be consumed. |
| num_opening_quotes_to_consume: u8, |
| /// True if the string uses backslash escaping of special characters, |
| /// e.g. `'abc\ndef\'ghi'` |
| backslash_escape: bool, |
| } |
| |
| /// SQL Tokenizer |
| pub struct Tokenizer<'a> { |
| dialect: &'a dyn Dialect, |
| query: &'a str, |
| /// If true (the default), the tokenizer will un-escape literal |
| /// SQL strings. See [`Tokenizer::with_unescape`] for more details. |
| unescape: bool, |
| } |
| |
| impl<'a> Tokenizer<'a> { |
| /// Create a new SQL tokenizer for the specified SQL statement |
| /// |
| /// ``` |
| /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer}; |
| /// # use sqlparser::dialect::GenericDialect; |
| /// # let dialect = GenericDialect{}; |
| /// let query = r#"SELECT 'foo'"#; |
| /// |
| /// // Parsing the query |
| /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap(); |
| /// |
| /// assert_eq!(tokens, vec![ |
| /// Token::make_word("SELECT", None), |
| /// Token::Whitespace(Whitespace::Space), |
| /// Token::SingleQuotedString("foo".to_string()), |
| /// ]); |
| /// ``` |
| pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self { |
| Self { |
| dialect, |
| query, |
| unescape: true, |
| } |
| } |
| |
| /// Set unescape mode |
| /// |
| /// When true (default) the tokenizer unescapes literal values |
| /// (for example, `""` in SQL is unescaped to the literal `"`). |
| /// |
| /// When false, the tokenizer returns the strings exactly as written |
| /// in the query. This can be helpful for programs that wish to |
| /// recover the *exact* original query text without normalizing |
| /// the escaping. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// # use sqlparser::tokenizer::{Token, Tokenizer}; |
| /// # use sqlparser::dialect::GenericDialect; |
| /// # let dialect = GenericDialect{}; |
| /// let query = r#""Foo "" Bar""#; |
| /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"')); |
| /// let original = Token::make_word(r#"Foo "" Bar"#, Some('"')); |
| /// |
| /// // Parsing with unescaping (default) |
| /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap(); |
| /// assert_eq!(tokens, vec![unescaped]); |
| /// |
| /// // Parsing with unescape = false |
| /// let tokens = Tokenizer::new(&dialect, &query) |
| /// .with_unescape(false) |
| /// .tokenize().unwrap(); |
| /// assert_eq!(tokens, vec![original]); |
| /// ``` |
| pub fn with_unescape(mut self, unescape: bool) -> Self { |
| self.unescape = unescape; |
| self |
| } |
| |
| /// Tokenize the statement and produce a vector of tokens |
| pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> { |
| let twl = self.tokenize_with_location()?; |
| Ok(twl.into_iter().map(|t| t.token).collect()) |
| } |
| |
| /// Tokenize the statement and produce a vector of tokens with location information |
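| /// |
| /// A minimal sketch: |
| /// ``` |
| /// # use sqlparser::dialect::GenericDialect; |
| /// # use sqlparser::tokenizer::Tokenizer; |
| /// # let dialect = GenericDialect{}; |
| /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize_with_location().unwrap(); |
| /// // the first token (`SELECT`) starts at line 1, column 1 |
| /// assert_eq!(tokens[0].span.start.line, 1); |
| /// assert_eq!(tokens[0].span.start.column, 1); |
| /// ``` |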
| pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> { |
| let mut tokens: Vec<TokenWithSpan> = vec![]; |
| self.tokenize_with_location_into_buf(&mut tokens) |
| .map(|_| tokens) |
| } |
| |
| /// Tokenize the statement and append tokens with location information into the provided buffer. |
| /// If an error occurs, the buffer will contain all tokens that were successfully parsed before the error. |
| pub fn tokenize_with_location_into_buf( |
| &mut self, |
| buf: &mut Vec<TokenWithSpan>, |
| ) -> Result<(), TokenizerError> { |
| let mut state = State { |
| peekable: self.query.chars().peekable(), |
| line: 1, |
| col: 1, |
| }; |
| |
| let mut location = state.location(); |
| while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? { |
| let span = location.span_to(state.location()); |
| |
| buf.push(TokenWithSpan { token, span }); |
| |
| location = state.location(); |
| } |
| Ok(()) |
| } |
| |
| // Tokenize the identifier or keywords in `ch` |
| fn tokenize_identifier_or_keyword( |
| &self, |
| ch: impl IntoIterator<Item = char>, |
| chars: &mut State, |
| ) -> Result<Option<Token>, TokenizerError> { |
| chars.next(); // consume the first char |
| let ch: String = ch.into_iter().collect(); |
| let word = self.tokenize_word(ch, chars); |
| |
| // TODO: implement parsing of exponent here |
| if word.chars().all(|x| x.is_ascii_digit() || x == '.') { |
| let mut inner_state = State { |
| peekable: word.chars().peekable(), |
| line: 0, |
| col: 0, |
| }; |
| let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.')); |
| let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.')); |
| s += s2.as_str(); |
| return Ok(Some(Token::Number(s, false))); |
| } |
| |
| Ok(Some(Token::make_word(&word, None))) |
| } |
| |
| /// Get the next token or return None |
| fn next_token( |
| &self, |
| chars: &mut State, |
| prev_token: Option<&Token>, |
| ) -> Result<Option<Token>, TokenizerError> { |
| match chars.peek() { |
| Some(&ch) => match ch { |
| ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), |
| '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)), |
| '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)), |
| '\r' => { |
| // Emit a single Whitespace::Newline token for \r and \r\n |
| chars.next(); |
| if let Some('\n') = chars.peek() { |
| chars.next(); |
| } |
| Ok(Some(Token::Whitespace(Whitespace::Newline))) |
| } |
| // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings |
| b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) => |
| { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('\'') => { |
| if self.dialect.supports_triple_quoted_string() { |
| return self |
| .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>( |
| chars, |
| '\'', |
| false, |
| Token::SingleQuotedByteStringLiteral, |
| Token::TripleSingleQuotedByteStringLiteral, |
| ); |
| } |
| let s = self.tokenize_single_quoted_string(chars, '\'', false)?; |
| Ok(Some(Token::SingleQuotedByteStringLiteral(s))) |
| } |
| Some('\"') => { |
| if self.dialect.supports_triple_quoted_string() { |
| return self |
| .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>( |
| chars, |
| '"', |
| false, |
| Token::DoubleQuotedByteStringLiteral, |
| Token::TripleDoubleQuotedByteStringLiteral, |
| ); |
| } |
| let s = self.tokenize_single_quoted_string(chars, '\"', false)?; |
| Ok(Some(Token::DoubleQuotedByteStringLiteral(s))) |
| } |
| _ => { |
| // regular identifier starting with an "b" or "B" |
| let s = self.tokenize_word(b, chars); |
| Ok(Some(Token::make_word(&s, None))) |
| } |
| } |
| } |
| // BigQuery uses r or R for raw string literal |
| b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('\'') => self |
| .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>( |
| chars, |
| '\'', |
| false, |
| Token::SingleQuotedRawStringLiteral, |
| Token::TripleSingleQuotedRawStringLiteral, |
| ), |
| Some('\"') => self |
| .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>( |
| chars, |
| '"', |
| false, |
| Token::DoubleQuotedRawStringLiteral, |
| Token::TripleDoubleQuotedRawStringLiteral, |
| ), |
| _ => { |
| // regular identifier starting with an "r" or "R" |
| let s = self.tokenize_word(b, chars); |
| Ok(Some(Token::make_word(&s, None))) |
| } |
| } |
| } |
| // Redshift uses lower case n for national string literal |
| n @ 'N' | n @ 'n' => { |
| chars.next(); // consume, to check the next char |
| match chars.peek() { |
| Some('\'') => { |
| // N'...' - a <national character string literal> |
| let backslash_escape = |
| self.dialect.supports_string_literal_backslash_escape(); |
| let s = |
| self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?; |
| Ok(Some(Token::NationalStringLiteral(s))) |
| } |
| _ => { |
| // regular identifier starting with an "N" |
| let s = self.tokenize_word(n, chars); |
| Ok(Some(Token::make_word(&s, None))) |
| } |
| } |
| } |
| // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard. |
| x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => { |
| let starting_loc = chars.location(); |
| chars.next(); // consume, to check the next char |
| match chars.peek() { |
| Some('\'') => { |
| let s = |
| self.tokenize_escaped_single_quoted_string(starting_loc, chars)?; |
| Ok(Some(Token::EscapedStringLiteral(s))) |
| } |
| _ => { |
| // regular identifier starting with an "E" or "e" |
| let s = self.tokenize_word(x, chars); |
| Ok(Some(Token::make_word(&s, None))) |
| } |
| } |
| } |
| // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL |
| x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => { |
| chars.next(); // consume, to check the next char |
| if chars.peek() == Some(&'&') { |
| // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier |
| let mut chars_clone = chars.peekable.clone(); |
| chars_clone.next(); // consume the '&' in the clone |
| if chars_clone.peek() == Some(&'\'') { |
| chars.next(); // consume the '&' in the original iterator |
| let s = unescape_unicode_single_quoted_string(chars)?; |
| return Ok(Some(Token::UnicodeStringLiteral(s))); |
| } |
| } |
| // regular identifier starting with an "U" or "u" |
| let s = self.tokenize_word(x, chars); |
| Ok(Some(Token::make_word(&s, None))) |
| } |
| // The spec only allows an uppercase 'X' to introduce a hex |
| // string, but PostgreSQL, at least, allows a lowercase 'x' too. |
| x @ 'x' | x @ 'X' => { |
| chars.next(); // consume, to check the next char |
| match chars.peek() { |
| Some('\'') => { |
| // X'...' - a <binary string literal> |
| let s = self.tokenize_single_quoted_string(chars, '\'', true)?; |
| Ok(Some(Token::HexStringLiteral(s))) |
| } |
| _ => { |
| // regular identifier starting with an "X" |
| let s = self.tokenize_word(x, chars); |
| Ok(Some(Token::make_word(&s, None))) |
| } |
| } |
| } |
| // single quoted string |
| '\'' => { |
| if self.dialect.supports_triple_quoted_string() { |
| return self |
| .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>( |
| chars, |
| '\'', |
| self.dialect.supports_string_literal_backslash_escape(), |
| Token::SingleQuotedString, |
| Token::TripleSingleQuotedString, |
| ); |
| } |
| let s = self.tokenize_single_quoted_string( |
| chars, |
| '\'', |
| self.dialect.supports_string_literal_backslash_escape(), |
| )?; |
| |
| Ok(Some(Token::SingleQuotedString(s))) |
| } |
| // double quoted string |
| '\"' if !self.dialect.is_delimited_identifier_start(ch) |
| && !self.dialect.is_identifier_start(ch) => |
| { |
| if self.dialect.supports_triple_quoted_string() { |
| return self |
| .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>( |
| chars, |
| '"', |
| self.dialect.supports_string_literal_backslash_escape(), |
| Token::DoubleQuotedString, |
| Token::TripleDoubleQuotedString, |
| ); |
| } |
| let s = self.tokenize_single_quoted_string( |
| chars, |
| '"', |
| self.dialect.supports_string_literal_backslash_escape(), |
| )?; |
| |
| Ok(Some(Token::DoubleQuotedString(s))) |
| } |
| // delimited (quoted) identifier |
| quote_start if self.dialect.is_delimited_identifier_start(ch) => { |
| let word = self.tokenize_quoted_identifier(quote_start, chars)?; |
| Ok(Some(Token::make_word(&word, Some(quote_start)))) |
| } |
| // Potentially nested delimited (quoted) identifier |
| quote_start |
| if self |
| .dialect |
| .is_nested_delimited_identifier_start(quote_start) |
| && self |
| .dialect |
| .peek_nested_delimited_identifier_quotes(chars.peekable.clone()) |
| .is_some() => |
| { |
| let Some((quote_start, nested_quote_start)) = self |
| .dialect |
| .peek_nested_delimited_identifier_quotes(chars.peekable.clone()) |
| else { |
| return self.tokenizer_error( |
| chars.location(), |
| format!("Expected nested delimiter '{quote_start}' before EOF."), |
| ); |
| }; |
| |
| let Some(nested_quote_start) = nested_quote_start else { |
| let word = self.tokenize_quoted_identifier(quote_start, chars)?; |
| return Ok(Some(Token::make_word(&word, Some(quote_start)))); |
| }; |
| |
| let mut word = vec![]; |
| let quote_end = Word::matching_end_quote(quote_start); |
| let nested_quote_end = Word::matching_end_quote(nested_quote_start); |
| let error_loc = chars.location(); |
| |
| chars.next(); // skip the first delimiter |
| peeking_take_while(chars, |ch| ch.is_whitespace()); |
| if chars.peek() != Some(&nested_quote_start) { |
| return self.tokenizer_error( |
| error_loc, |
| format!("Expected nested delimiter '{nested_quote_start}' before EOF."), |
| ); |
| } |
| word.push(nested_quote_start.into()); |
| word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?); |
| word.push(nested_quote_end.into()); |
| peeking_take_while(chars, |ch| ch.is_whitespace()); |
| if chars.peek() != Some("e_end) { |
| return self.tokenizer_error( |
| error_loc, |
| format!("Expected close delimiter '{quote_end}' before EOF."), |
| ); |
| } |
| chars.next(); // skip close delimiter |
| |
| Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) |
| } |
| // numbers and period |
| '0'..='9' | '.' => { |
| // Special case: if `._` is encountered after a word, that word is a |
| // table name and the `_` starts the column name. If the previous |
| // token is not a word, then this is neither a valid word nor a |
| // valid number. |
| if ch == '.' && chars.peekable.clone().nth(1) == Some('_') { |
| if let Some(Token::Word(_)) = prev_token { |
| chars.next(); |
| return Ok(Some(Token::Period)); |
| } |
| |
| return self.tokenizer_error( |
| chars.location(), |
| "Unexpected character '_'".to_string(), |
| ); |
| } |
| |
| // Some dialects support underscore as number separator |
| // There can only be one at a time and it must be followed by another digit |
| let is_number_separator = |ch: char, next_char: Option<char>| { |
| self.dialect.supports_numeric_literal_underscores() |
| && ch == '_' |
| && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) |
| }; |
| |
| let mut s = peeking_next_take_while(chars, |ch, next_ch| { |
| ch.is_ascii_digit() || is_number_separator(ch, next_ch) |
| }); |
| |
| // match a hexadecimal literal that starts with 0x |
| if s == "0" && chars.peek() == Some(&'x') { |
| chars.next(); |
| let s2 = peeking_next_take_while(chars, |ch, next_ch| { |
| ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch) |
| }); |
| return Ok(Some(Token::HexStringLiteral(s2))); |
| } |
| |
| // match one period |
| if let Some('.') = chars.peek() { |
| s.push('.'); |
| chars.next(); |
| } |
| |
| // If the dialect supports identifiers that start with a numeric prefix |
| // and we have now consumed a dot, check if the previous token was a Word. |
| // If so, what follows is definitely not part of a decimal number and |
| // we should yield the dot as a dedicated token so compound identifiers |
| // starting with digits can be parsed correctly. |
| if s == "." && self.dialect.supports_numeric_prefix() { |
| if let Some(Token::Word(_)) = prev_token { |
| return Ok(Some(Token::Period)); |
| } |
| } |
| |
| // Consume fractional digits. |
| s += &peeking_next_take_while(chars, |ch, next_ch| { |
| ch.is_ascii_digit() || is_number_separator(ch, next_ch) |
| }); |
| |
| // No fraction -> Token::Period |
| if s == "." { |
| return Ok(Some(Token::Period)); |
| } |
| |
| // Parse exponent as number |
| let mut exponent_part = String::new(); |
| if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { |
| let mut char_clone = chars.peekable.clone(); |
| exponent_part.push(char_clone.next().unwrap()); |
| |
| // Optional sign |
| match char_clone.peek() { |
| Some(&c) if matches!(c, '+' | '-') => { |
| exponent_part.push(c); |
| char_clone.next(); |
| } |
| _ => (), |
| } |
| |
| match char_clone.peek() { |
| // Definitely an exponent, get original iterator up to speed and use it |
| Some(&c) if c.is_ascii_digit() => { |
| for _ in 0..exponent_part.len() { |
| chars.next(); |
| } |
| exponent_part += |
| &peeking_take_while(chars, |ch| ch.is_ascii_digit()); |
| s += exponent_part.as_str(); |
| } |
| // Not an exponent, discard the work done |
| _ => (), |
| } |
| } |
| |
| // If the dialect supports identifiers that start with a numeric prefix, |
| // we need to check if the value is in fact an identifier and must thus |
| // be tokenized as a word. |
| if self.dialect.supports_numeric_prefix() { |
| if exponent_part.is_empty() { |
| // If it is not a number with an exponent, it may be |
| // an identifier starting with digits. |
| let word = |
| peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); |
| |
| if !word.is_empty() { |
| s += word.as_str(); |
| return Ok(Some(Token::make_word(s.as_str(), None))); |
| } |
| } else if prev_token == Some(&Token::Period) { |
| // If the previous token was a period, thus not belonging to a number, |
| // the value we have is part of an identifier. |
| return Ok(Some(Token::make_word(s.as_str(), None))); |
| } |
| } |
| |
| let long = if chars.peek() == Some(&'L') { |
| chars.next(); |
| true |
| } else { |
| false |
| }; |
| Ok(Some(Token::Number(s, long))) |
| } |
| // punctuation |
| '(' => self.consume_and_return(chars, Token::LParen), |
| ')' => self.consume_and_return(chars, Token::RParen), |
| ',' => self.consume_and_return(chars, Token::Comma), |
| // operators |
| '-' => { |
| chars.next(); // consume the '-' |
| |
| match chars.peek() { |
| Some('-') => { |
| let mut is_comment = true; |
| if self.dialect.requires_single_line_comment_whitespace() { |
| is_comment = Some(' ') == chars.peekable.clone().nth(1); |
| } |
| |
| if is_comment { |
| chars.next(); // consume second '-' |
| let comment = self.tokenize_single_line_comment(chars); |
| return Ok(Some(Token::Whitespace( |
| Whitespace::SingleLineComment { |
| prefix: "--".to_owned(), |
| comment, |
| }, |
| ))); |
| } |
| |
| self.start_binop(chars, "-", Token::Minus) |
| } |
| Some('>') => { |
| chars.next(); |
| match chars.peek() { |
| Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow), |
| _ => self.start_binop(chars, "->", Token::Arrow), |
| } |
| } |
| // a regular '-' operator |
| _ => self.start_binop(chars, "-", Token::Minus), |
| } |
| } |
| '/' => { |
| chars.next(); // consume the '/' |
| match chars.peek() { |
| Some('*') => { |
| chars.next(); // consume the '*', starting a multi-line comment |
| self.tokenize_multiline_comment(chars) |
| } |
| Some('/') if dialect_of!(self is SnowflakeDialect) => { |
| chars.next(); // consume the second '/', starting a snowflake single-line comment |
| let comment = self.tokenize_single_line_comment(chars); |
| Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "//".to_owned(), |
| comment, |
| }))) |
| } |
| Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => { |
| self.consume_and_return(chars, Token::DuckIntDiv) |
| } |
| // a regular '/' operator |
| _ => Ok(Some(Token::Div)), |
| } |
| } |
| '+' => self.consume_and_return(chars, Token::Plus), |
| '*' => self.consume_and_return(chars, Token::Mul), |
| '%' => { |
| chars.next(); // advance past '%' |
| match chars.peek() { |
| Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)), |
| Some(sch) if self.dialect.is_identifier_start('%') => { |
| self.tokenize_identifier_or_keyword([ch, *sch], chars) |
| } |
| _ => self.start_binop(chars, "%", Token::Mod), |
| } |
| } |
| '|' => { |
| chars.next(); // consume the '|' |
| match chars.peek() { |
| Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot), |
| Some('|') => { |
| chars.next(); // consume the second '|' |
| match chars.peek() { |
| Some('/') => { |
| self.consume_for_binop(chars, "||/", Token::PGCubeRoot) |
| } |
| _ => self.start_binop(chars, "||", Token::StringConcat), |
| } |
| } |
| Some('&') if self.dialect.supports_geometric_types() => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('>') => self.consume_for_binop( |
| chars, |
| "|&>", |
| Token::VerticalBarAmpersandRightAngleBracket, |
| ), |
| _ => self.start_binop_opt(chars, "|&", None), |
| } |
| } |
| Some('>') if self.dialect.supports_geometric_types() => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('>') => self.consume_for_binop( |
| chars, |
| "|>>", |
| Token::VerticalBarShiftRight, |
| ), |
| _ => self.start_binop_opt(chars, "|>", None), |
| } |
| } |
| Some('>') if self.dialect.supports_pipe_operator() => { |
| self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket) |
| } |
| // Bitwise '|' operator |
| _ => self.start_binop(chars, "|", Token::Pipe), |
| } |
| } |
| '=' => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('>') => self.consume_and_return(chars, Token::RArrow), |
| Some('=') => self.consume_and_return(chars, Token::DoubleEq), |
| _ => Ok(Some(Token::Eq)), |
| } |
| } |
| '!' => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('=') => self.consume_and_return(chars, Token::Neq), |
| Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark), |
| Some('~') => { |
| chars.next(); |
| match chars.peek() { |
| Some('*') => self |
| .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk), |
| Some('~') => { |
| chars.next(); |
| match chars.peek() { |
| Some('*') => self.consume_and_return( |
| chars, |
| Token::ExclamationMarkDoubleTildeAsterisk, |
| ), |
| _ => Ok(Some(Token::ExclamationMarkDoubleTilde)), |
| } |
| } |
| _ => Ok(Some(Token::ExclamationMarkTilde)), |
| } |
| } |
| _ => Ok(Some(Token::ExclamationMark)), |
| } |
| } |
| '<' => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('=') => { |
| chars.next(); |
| match chars.peek() { |
| Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship), |
| _ => self.start_binop(chars, "<=", Token::LtEq), |
| } |
| } |
| Some('|') if self.dialect.supports_geometric_types() => { |
| self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar) |
| } |
| Some('>') => self.consume_for_binop(chars, "<>", Token::Neq), |
| Some('<') if self.dialect.supports_geometric_types() => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('|') => self.consume_for_binop( |
| chars, |
| "<<|", |
| Token::ShiftLeftVerticalBar, |
| ), |
| _ => self.start_binop(chars, "<<", Token::ShiftLeft), |
| } |
| } |
| Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft), |
| Some('-') if self.dialect.supports_geometric_types() => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('>') => { |
| self.consume_for_binop(chars, "<->", Token::TwoWayArrow) |
| } |
| _ => self.start_binop_opt(chars, "<-", None), |
| } |
| } |
| Some('^') if self.dialect.supports_geometric_types() => { |
| self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret) |
| } |
| Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt), |
| _ => self.start_binop(chars, "<", Token::Lt), |
| } |
| } |
| '>' => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq), |
| Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight), |
| Some('^') if self.dialect.supports_geometric_types() => { |
| self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret) |
| } |
| _ => self.start_binop(chars, ">", Token::Gt), |
| } |
| } |
| ':' => { |
| chars.next(); |
| match chars.peek() { |
| Some(':') => self.consume_and_return(chars, Token::DoubleColon), |
| Some('=') => self.consume_and_return(chars, Token::Assignment), |
| _ => Ok(Some(Token::Colon)), |
| } |
| } |
| ';' => self.consume_and_return(chars, Token::SemiColon), |
| '\\' => self.consume_and_return(chars, Token::Backslash), |
| '[' => self.consume_and_return(chars, Token::LBracket), |
| ']' => self.consume_and_return(chars, Token::RBracket), |
| '&' => { |
| chars.next(); // consume the '&' |
| match chars.peek() { |
| Some('>') if self.dialect.supports_geometric_types() => { |
| chars.next(); |
| self.consume_and_return(chars, Token::AmpersandRightAngleBracket) |
| } |
| Some('<') if self.dialect.supports_geometric_types() => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('|') => self.consume_and_return( |
| chars, |
| Token::AmpersandLeftAngleBracketVerticalBar, |
| ), |
| _ => { |
| self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket) |
| } |
| } |
| } |
| Some('&') => { |
| chars.next(); // consume the second '&' |
| self.start_binop(chars, "&&", Token::Overlap) |
| } |
| // Bitwise '&' operator |
| _ => self.start_binop(chars, "&", Token::Ampersand), |
| } |
| } |
| '^' => { |
| chars.next(); // consume the '^' |
| match chars.peek() { |
| Some('@') => self.consume_and_return(chars, Token::CaretAt), |
| _ => Ok(Some(Token::Caret)), |
| } |
| } |
| '{' => self.consume_and_return(chars, Token::LBrace), |
| '}' => self.consume_and_return(chars, Token::RBrace), |
| '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) => |
| { |
| chars.next(); // consume the '#', starting a single-line comment |
| let comment = self.tokenize_single_line_comment(chars); |
| Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "#".to_owned(), |
| comment, |
| }))) |
| } |
| '~' => { |
| chars.next(); // consume |
| match chars.peek() { |
| Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk), |
| Some('=') if self.dialect.supports_geometric_types() => { |
| self.consume_for_binop(chars, "~=", Token::TildeEqual) |
| } |
| Some('~') => { |
| chars.next(); |
| match chars.peek() { |
| Some('*') => { |
| self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk) |
| } |
| _ => self.start_binop(chars, "~~", Token::DoubleTilde), |
| } |
| } |
| _ => self.start_binop(chars, "~", Token::Tilde), |
| } |
| } |
| '#' => { |
| chars.next(); |
| match chars.peek() { |
| Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus), |
| Some('>') => { |
| chars.next(); |
| match chars.peek() { |
| Some('>') => { |
| self.consume_for_binop(chars, "#>>", Token::HashLongArrow) |
| } |
| _ => self.start_binop(chars, "#>", Token::HashArrow), |
| } |
| } |
| Some(' ') => Ok(Some(Token::Sharp)), |
| Some('#') if self.dialect.supports_geometric_types() => { |
| self.consume_for_binop(chars, "##", Token::DoubleSharp) |
| } |
| Some(sch) if self.dialect.is_identifier_start('#') => { |
| self.tokenize_identifier_or_keyword([ch, *sch], chars) |
| } |
| _ => self.start_binop(chars, "#", Token::Sharp), |
| } |
| } |
| '@' => { |
| chars.next(); |
| match chars.peek() { |
| Some('@') if self.dialect.supports_geometric_types() => { |
| self.consume_and_return(chars, Token::AtAt) |
| } |
| Some('-') if self.dialect.supports_geometric_types() => { |
| chars.next(); |
| match chars.peek() { |
| Some('@') => self.consume_and_return(chars, Token::AtDashAt), |
| _ => self.start_binop_opt(chars, "@-", None), |
| } |
| } |
| Some('>') => self.consume_and_return(chars, Token::AtArrow), |
| Some('?') => self.consume_and_return(chars, Token::AtQuestion), |
| Some('@') => { |
| chars.next(); |
| match chars.peek() { |
| Some(' ') => Ok(Some(Token::AtAt)), |
| Some(tch) if self.dialect.is_identifier_start('@') => { |
| self.tokenize_identifier_or_keyword([ch, '@', *tch], chars) |
| } |
| _ => Ok(Some(Token::AtAt)), |
| } |
| } |
| Some(' ') => Ok(Some(Token::AtSign)), |
| // We break on quotes here, because no dialect allows identifiers starting |
| // with @ and containing quotation marks (e.g. `@'foo'`) unless they are |
| // quoted, which is tokenized as a quoted string, not here (e.g. |
| // `"@'foo'"`). Further, at least two dialects parse `@` followed by a |
| // quoted string as two separate tokens, which this allows. For example, |
| // Postgres parses `@'1'` as the absolute value of '1' which is implicitly |
| // cast to a numeric type. And when parsing MySQL-style grantees (e.g. |
| // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens |
| // for the user, the `@`, and the host. |
| Some('\'') => Ok(Some(Token::AtSign)), |
| Some('\"') => Ok(Some(Token::AtSign)), |
| Some('`') => Ok(Some(Token::AtSign)), |
| Some(sch) if self.dialect.is_identifier_start('@') => { |
| self.tokenize_identifier_or_keyword([ch, *sch], chars) |
| } |
| _ => Ok(Some(Token::AtSign)), |
| } |
| } |
| // Postgres uses ? for jsonb operators, not prepared statements |
| '?' if self.dialect.supports_geometric_types() => { |
chars.next(); // consume the '?'
| match chars.peek() { |
| Some('|') => { |
| chars.next(); |
| match chars.peek() { |
| Some('|') => self.consume_and_return( |
| chars, |
| Token::QuestionMarkDoubleVerticalBar, |
| ), |
| _ => Ok(Some(Token::QuestionPipe)), |
| } |
| } |
| Some('&') => self.consume_and_return(chars, Token::QuestionAnd), |
| Some('-') => { |
chars.next(); // consume the '-'
| match chars.peek() { |
| Some('|') => self |
| .consume_and_return(chars, Token::QuestionMarkDashVerticalBar), |
| _ => Ok(Some(Token::QuestionMarkDash)), |
| } |
| } |
| Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp), |
| _ => self.consume_and_return(chars, Token::Question), |
| } |
| } |
| '?' => { |
| chars.next(); |
| let s = peeking_take_while(chars, |ch| ch.is_numeric()); |
| Ok(Some(Token::Placeholder(String::from("?") + &s))) |
| } |
| |
| // identifier or keyword |
| ch if self.dialect.is_identifier_start(ch) => { |
| self.tokenize_identifier_or_keyword([ch], chars) |
| } |
| '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)), |
| |
| // whitespace check (including unicode chars) should be last as it covers some of the chars above |
| ch if ch.is_whitespace() => { |
| self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)) |
| } |
| other => self.consume_and_return(chars, Token::Char(other)), |
| }, |
| None => Ok(None), |
| } |
| } |
| |
/// Consume the next character, then parse a custom binary operator. The consumed character must already be included in `prefix`.
| fn consume_for_binop( |
| &self, |
| chars: &mut State, |
| prefix: &str, |
| default: Token, |
| ) -> Result<Option<Token>, TokenizerError> { |
| chars.next(); // consume the first char |
| self.start_binop_opt(chars, prefix, Some(default)) |
| } |
| |
/// Parse a custom binary operator, falling back to `default` if no custom operator characters follow.
| fn start_binop( |
| &self, |
| chars: &mut State, |
| prefix: &str, |
| default: Token, |
| ) -> Result<Option<Token>, TokenizerError> { |
| self.start_binop_opt(chars, prefix, Some(default)) |
| } |
| |
/// Parse a custom binary operator. Falls back to `default` if no custom operator characters follow, and errors if there is no custom operator and no default.
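///
/// A sketch of the behavior, assuming a dialect whose
/// `is_custom_operator_part` accepts `'@'`:
///
/// ```text
/// prefix = ">>", default = Some(Token::ShiftRight)
///   remaining input "@ b" => Token::CustomBinaryOperator(">>@")
///   remaining input " b"  => Token::ShiftRight
/// ```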
| fn start_binop_opt( |
| &self, |
| chars: &mut State, |
| prefix: &str, |
| default: Option<Token>, |
| ) -> Result<Option<Token>, TokenizerError> { |
| let mut custom = None; |
| while let Some(&ch) = chars.peek() { |
| if !self.dialect.is_custom_operator_part(ch) { |
| break; |
| } |
| |
| custom.get_or_insert_with(|| prefix.to_string()).push(ch); |
| chars.next(); |
| } |
| match (custom, default) { |
| (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()), |
| (None, Some(tok)) => Ok(Some(tok)), |
| (None, None) => self.tokenizer_error( |
| chars.location(), |
| format!("Expected a valid binary operator after '{prefix}'"), |
| ), |
| } |
| } |
| |
/// Tokenize a dollar-prefixed value (i.e. a dollar-quoted string or a placeholder).
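///
/// For example (matching the test cases below):
///
/// ```text
/// $$abc$$        => DollarQuotedString { value: "abc", tag: None }
/// $tag$abc$tag$  => DollarQuotedString { value: "abc", tag: Some("tag") }
/// $1             => Placeholder("$1")
/// ```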
| fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> { |
| let mut s = String::new(); |
| let mut value = String::new(); |
| |
| chars.next(); |
| |
// `$$` starts an (untagged) dollar-quoted string, unless the dialect treats `$$` as a placeholder.
| if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() { |
| chars.next(); |
| |
| let mut is_terminated = false; |
| let mut prev: Option<char> = None; |
| |
| while let Some(&ch) = chars.peek() { |
| if prev == Some('$') { |
| if ch == '$' { |
| chars.next(); |
| is_terminated = true; |
| break; |
| } else { |
| s.push('$'); |
| s.push(ch); |
| } |
| } else if ch != '$' { |
| s.push(ch); |
| } |
| |
| prev = Some(ch); |
| chars.next(); |
| } |
| |
| return if chars.peek().is_none() && !is_terminated { |
| self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string") |
| } else { |
| Ok(Token::DollarQuotedString(DollarQuotedString { |
| value: s, |
| tag: None, |
| })) |
| }; |
| } else { |
| value.push_str(&peeking_take_while(chars, |ch| { |
| ch.is_alphanumeric() |
| || ch == '_' |
| // Allow $ as a placeholder character if the dialect supports it |
| || matches!(ch, '$' if self.dialect.supports_dollar_placeholder()) |
| })); |
| |
// A `$` after the tag starts a tagged dollar-quoted string, unless the dialect treats dollar-prefixed values as placeholders.
| if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() { |
| chars.next(); |
| |
| let mut temp = String::new(); |
| let end_delimiter = format!("${value}$"); |
| |
| loop { |
| match chars.next() { |
| Some(ch) => { |
| temp.push(ch); |
| |
| if temp.ends_with(&end_delimiter) { |
| if let Some(temp) = temp.strip_suffix(&end_delimiter) { |
| s.push_str(temp); |
| } |
| break; |
| } |
| } |
| None => { |
| if temp.ends_with(&end_delimiter) { |
| if let Some(temp) = temp.strip_suffix(&end_delimiter) { |
| s.push_str(temp); |
| } |
| break; |
| } |
| |
| return self.tokenizer_error( |
| chars.location(), |
| "Unterminated dollar-quoted, expected $", |
| ); |
| } |
| } |
| } |
| } else { |
| return Ok(Token::Placeholder(String::from("$") + &value)); |
| } |
| } |
| |
| Ok(Token::DollarQuotedString(DollarQuotedString { |
| value: s, |
| tag: if value.is_empty() { None } else { Some(value) }, |
| })) |
| } |
| |
| fn tokenizer_error<R>( |
| &self, |
| loc: Location, |
| message: impl Into<String>, |
| ) -> Result<R, TokenizerError> { |
| Err(TokenizerError { |
| message: message.into(), |
| location: loc, |
| }) |
| } |
| |
// Consume characters up to and including the terminating newline, if any
| fn tokenize_single_line_comment(&self, chars: &mut State) -> String { |
| let mut comment = peeking_take_while(chars, |ch| match ch { |
| '\n' => false, // Always stop at \n |
| '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres |
| _ => true, // Keep consuming for other characters |
| }); |
| |
| if let Some(ch) = chars.next() { |
| assert!(ch == '\n' || ch == '\r'); |
| comment.push(ch); |
| } |
| |
| comment |
| } |
| |
/// Tokenize an identifier or keyword, after the first char has already been consumed.
| fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String { |
| let mut s = first_chars.into(); |
| s.push_str(&peeking_take_while(chars, |ch| { |
| self.dialect.is_identifier_part(ch) |
| })); |
| s |
| } |
| |
| /// Read a quoted identifier |
| fn tokenize_quoted_identifier( |
| &self, |
| quote_start: char, |
| chars: &mut State, |
| ) -> Result<String, TokenizerError> { |
| let error_loc = chars.location(); |
| chars.next(); // consume the opening quote |
| let quote_end = Word::matching_end_quote(quote_start); |
| let (s, last_char) = self.parse_quoted_ident(chars, quote_end); |
| |
| if last_char == Some(quote_end) { |
| Ok(s) |
| } else { |
| self.tokenizer_error( |
| error_loc, |
| format!("Expected close delimiter '{quote_end}' before EOF."), |
| ) |
| } |
| } |
| |
/// Read a single-quoted string, starting at the opening quote, processing escape sequences.
| fn tokenize_escaped_single_quoted_string( |
| &self, |
| starting_loc: Location, |
| chars: &mut State, |
| ) -> Result<String, TokenizerError> { |
| if let Some(s) = unescape_single_quoted_string(chars) { |
| return Ok(s); |
| } |
| |
| self.tokenizer_error(starting_loc, "Unterminated encoded string literal") |
| } |
| |
/// Reads a string literal quoted by single or triple quote characters.
| /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`. |
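///
/// ```text
/// 'abc'     => single_quote_token("abc")
/// ''        => single_quote_token("")    // two opening quotes: empty string
/// '''abc''' => triple_quote_token("abc")
/// ```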
| fn tokenize_single_or_triple_quoted_string<F>( |
| &self, |
| chars: &mut State, |
| quote_style: char, |
| backslash_escape: bool, |
| single_quote_token: F, |
| triple_quote_token: F, |
| ) -> Result<Option<Token>, TokenizerError> |
| where |
| F: Fn(String) -> Token, |
| { |
| let error_loc = chars.location(); |
| |
| let mut num_opening_quotes = 0u8; |
| for _ in 0..3 { |
| if Some("e_style) == chars.peek() { |
| chars.next(); // Consume quote. |
| num_opening_quotes += 1; |
| } else { |
| break; |
| } |
| } |
| |
| let (token_fn, num_quote_chars) = match num_opening_quotes { |
| 1 => (single_quote_token, NumStringQuoteChars::One), |
| 2 => { |
// Exactly two opening quotes delimit an empty string.
| return Ok(Some(single_quote_token("".into()))); |
| } |
| 3 => { |
| let Some(num_quote_chars) = NonZeroU8::new(3) else { |
| return self.tokenizer_error(error_loc, "invalid number of opening quotes"); |
| }; |
| ( |
| triple_quote_token, |
| NumStringQuoteChars::Many(num_quote_chars), |
| ) |
| } |
| _ => { |
| return self.tokenizer_error(error_loc, "invalid string literal opening"); |
| } |
| }; |
| |
| let settings = TokenizeQuotedStringSettings { |
| quote_style, |
| num_quote_chars, |
| num_opening_quotes_to_consume: 0, |
| backslash_escape, |
| }; |
| |
| self.tokenize_quoted_string(chars, settings) |
| .map(token_fn) |
| .map(Some) |
| } |
| |
| /// Reads a string literal quoted by a single quote character. |
| fn tokenize_single_quoted_string( |
| &self, |
| chars: &mut State, |
| quote_style: char, |
| backslash_escape: bool, |
| ) -> Result<String, TokenizerError> { |
| self.tokenize_quoted_string( |
| chars, |
| TokenizeQuotedStringSettings { |
| quote_style, |
| num_quote_chars: NumStringQuoteChars::One, |
| num_opening_quotes_to_consume: 1, |
| backslash_escape, |
| }, |
| ) |
| } |
| |
| /// Read a quoted string. |
| fn tokenize_quoted_string( |
| &self, |
| chars: &mut State, |
| settings: TokenizeQuotedStringSettings, |
| ) -> Result<String, TokenizerError> { |
| let mut s = String::new(); |
| let error_loc = chars.location(); |
| |
| // Consume any opening quotes. |
| for _ in 0..settings.num_opening_quotes_to_consume { |
| if Some(settings.quote_style) != chars.next() { |
| return self.tokenizer_error(error_loc, "invalid string literal opening"); |
| } |
| } |
| |
| let mut num_consecutive_quotes = 0; |
| while let Some(&ch) = chars.peek() { |
| let pending_final_quote = match settings.num_quote_chars { |
| NumStringQuoteChars::One => Some(NumStringQuoteChars::One), |
| n @ NumStringQuoteChars::Many(count) |
| if num_consecutive_quotes + 1 == count.get() => |
| { |
| Some(n) |
| } |
| NumStringQuoteChars::Many(_) => None, |
| }; |
| |
| match ch { |
| char if char == settings.quote_style && pending_final_quote.is_some() => { |
| chars.next(); // consume |
| |
| if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote { |
| // For an initial string like `"""abc"""`, at this point we have |
| // `abc""` in the buffer and have now matched the final `"`. |
| // However, the string to return is simply `abc`, so we strip off |
| // the trailing quotes before returning. |
| let mut buf = s.chars(); |
| for _ in 1..count.get() { |
| buf.next_back(); |
| } |
| return Ok(buf.as_str().to_string()); |
| } else if chars |
| .peek() |
| .map(|c| *c == settings.quote_style) |
| .unwrap_or(false) |
| { |
| s.push(ch); |
| if !self.unescape { |
// In no-escape mode, the original query text must be preserved verbatim
| s.push(ch); |
| } |
| chars.next(); |
| } else { |
| return Ok(s); |
| } |
| } |
| '\\' if settings.backslash_escape => { |
| // consume backslash |
| chars.next(); |
| |
| num_consecutive_quotes = 0; |
| |
| if let Some(next) = chars.peek() { |
| if !self.unescape |
| || (self.dialect.ignores_wildcard_escapes() |
| && (*next == '%' || *next == '_')) |
| { |
// In no-escape mode, the original query text must be preserved verbatim,
// including backslashes. Similarly, when the dialect ignores LIKE
// wildcard escapes (`\%`, `\_`), the backslash is not stripped.
| s.push(ch); |
| s.push(*next); |
| chars.next(); // consume next |
| } else { |
| let n = match next { |
| '0' => '\0', |
| 'a' => '\u{7}', |
| 'b' => '\u{8}', |
| 'f' => '\u{c}', |
| 'n' => '\n', |
| 'r' => '\r', |
| 't' => '\t', |
| 'Z' => '\u{1a}', |
| _ => *next, |
| }; |
| s.push(n); |
| chars.next(); // consume next |
| } |
| } |
| } |
| ch => { |
| chars.next(); // consume ch |
| |
| if ch == settings.quote_style { |
| num_consecutive_quotes += 1; |
| } else { |
| num_consecutive_quotes = 0; |
| } |
| |
| s.push(ch); |
| } |
| } |
| } |
| self.tokenizer_error(error_loc, "Unterminated string literal") |
| } |
| |
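/// Consume a multi-line comment after the opening `/*` has already been
/// consumed, returning it as `Whitespace::MultiLineComment`; tracks nested
/// `/* ... */` pairs when the dialect supports nested comments.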
| fn tokenize_multiline_comment( |
| &self, |
| chars: &mut State, |
| ) -> Result<Option<Token>, TokenizerError> { |
| let mut s = String::new(); |
| let mut nested = 1; |
| let supports_nested_comments = self.dialect.supports_nested_comments(); |
| |
| loop { |
| match chars.next() { |
| Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => { |
| chars.next(); // consume the '*' |
| s.push('/'); |
| s.push('*'); |
| nested += 1; |
| } |
| Some('*') if matches!(chars.peek(), Some('/')) => { |
| chars.next(); // consume the '/' |
| nested -= 1; |
| if nested == 0 { |
| break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); |
| } |
| s.push('*'); |
| s.push('/'); |
| } |
| Some(ch) => { |
| s.push(ch); |
| } |
| None => { |
| break self.tokenizer_error( |
| chars.location(), |
| "Unexpected EOF while in a multi-line comment", |
| ); |
| } |
| } |
| } |
| } |
| |
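/// Read the body of a quoted identifier up to the closing `quote_end`,
/// treating a doubled `quote_end` as an escaped quote character. Returns
/// the body and the last character read (the closing quote, if found).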
| fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) { |
| let mut last_char = None; |
| let mut s = String::new(); |
| while let Some(ch) = chars.next() { |
| if ch == quote_end { |
| if chars.peek() == Some("e_end) { |
| chars.next(); |
| s.push(ch); |
| if !self.unescape { |
// In no-escape mode, the original query text must be preserved verbatim
| s.push(ch); |
| } |
| } else { |
| last_char = Some(quote_end); |
| break; |
| } |
| } else { |
| s.push(ch); |
| } |
| } |
| (s, last_char) |
| } |
| |
| #[allow(clippy::unnecessary_wraps)] |
| fn consume_and_return( |
| &self, |
| chars: &mut State, |
| t: Token, |
| ) -> Result<Option<Token>, TokenizerError> { |
| chars.next(); |
| Ok(Some(t)) |
| } |
| } |
| |
| /// Read from `chars` until `predicate` returns `false` or EOF is hit. |
/// Return the characters read as a String, and keep the first non-matching
| /// char available as `chars.next()`. |
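///
/// For example, with an ASCII-digit predicate:
///
/// ```text
/// input "123abc" => returns "123", leaving 'a' as the next char
/// ```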
| fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String { |
| let mut s = String::new(); |
| while let Some(&ch) = chars.peek() { |
| if predicate(ch) { |
| chars.next(); // consume |
| s.push(ch); |
| } else { |
| break; |
| } |
| } |
| s |
| } |
| |
/// Same as `peeking_take_while`, but also passes the character following the current one (if any) to the predicate.
| fn peeking_next_take_while( |
| chars: &mut State, |
| mut predicate: impl FnMut(char, Option<char>) -> bool, |
| ) -> String { |
| let mut s = String::new(); |
| while let Some(&ch) = chars.peek() { |
| let next_char = chars.peekable.clone().nth(1); |
| if predicate(ch, next_char) { |
| chars.next(); // consume |
| s.push(ch); |
| } else { |
| break; |
| } |
| } |
| s |
| } |
| |
| fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> { |
| Unescape::new(chars).unescape() |
| } |
| |
| struct Unescape<'a: 'b, 'b> { |
| chars: &'b mut State<'a>, |
| } |
| |
| impl<'a: 'b, 'b> Unescape<'a, 'b> { |
| fn new(chars: &'b mut State<'a>) -> Self { |
| Self { chars } |
| } |
| fn unescape(mut self) -> Option<String> { |
| let mut unescaped = String::new(); |
| |
| self.chars.next(); |
| |
| while let Some(c) = self.chars.next() { |
| if c == '\'' { |
| // case: '''' |
| if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) { |
| self.chars.next(); |
| unescaped.push('\''); |
| continue; |
| } |
| return Some(unescaped); |
| } |
| |
| if c != '\\' { |
| unescaped.push(c); |
| continue; |
| } |
| |
| let c = match self.chars.next()? { |
| 'b' => '\u{0008}', |
| 'f' => '\u{000C}', |
| 'n' => '\n', |
| 'r' => '\r', |
| 't' => '\t', |
| 'u' => self.unescape_unicode_16()?, |
| 'U' => self.unescape_unicode_32()?, |
| 'x' => self.unescape_hex()?, |
| c if c.is_digit(8) => self.unescape_octal(c)?, |
| c => c, |
| }; |
| |
| unescaped.push(Self::check_null(c)?); |
| } |
| |
| None |
| } |
| |
| #[inline] |
| fn check_null(c: char) -> Option<char> { |
| if c == '\0' { |
| None |
| } else { |
| Some(c) |
| } |
| } |
| |
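// Converts a string of RADIX digits to a single ASCII character, e.g.
// `byte_to_char::<16>("4B")` yields Some('K'); byte values above 127
// yield None.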
| #[inline] |
| fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> { |
// u32 is used here because Pg wraps the value on overflow rather than raising an error.
| match u32::from_str_radix(s, RADIX) { |
| Err(_) => None, |
| Ok(n) => { |
| let n = n & 0xFF; |
| if n <= 127 { |
| char::from_u32(n) |
| } else { |
| None |
| } |
| } |
| } |
| } |
| |
| // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F) |
| fn unescape_hex(&mut self) -> Option<char> { |
| let mut s = String::new(); |
| |
| for _ in 0..2 { |
| match self.next_hex_digit() { |
| Some(c) => s.push(c), |
| None => break, |
| } |
| } |
| |
| if s.is_empty() { |
| return Some('x'); |
| } |
| |
| Self::byte_to_char::<16>(&s) |
| } |
| |
| #[inline] |
| fn next_hex_digit(&mut self) -> Option<char> { |
| match self.chars.peek() { |
| Some(c) if c.is_ascii_hexdigit() => self.chars.next(), |
| _ => None, |
| } |
| } |
| |
| // Octal byte value. \o, \oo, \ooo (o = 0–7) |
| fn unescape_octal(&mut self, c: char) -> Option<char> { |
| let mut s = String::new(); |
| |
| s.push(c); |
| for _ in 0..2 { |
match self.next_octal_digit() {
| Some(c) => s.push(c), |
| None => break, |
| } |
| } |
| |
| Self::byte_to_char::<8>(&s) |
| } |
| |
| #[inline] |
fn next_octal_digit(&mut self) -> Option<char> {
| match self.chars.peek() { |
| Some(c) if c.is_digit(8) => self.chars.next(), |
| _ => None, |
| } |
| } |
| |
| // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F) |
| fn unescape_unicode_16(&mut self) -> Option<char> { |
| self.unescape_unicode::<4>() |
| } |
| |
| // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F) |
| fn unescape_unicode_32(&mut self) -> Option<char> { |
| self.unescape_unicode::<8>() |
| } |
| |
| fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> { |
| let mut s = String::new(); |
| for _ in 0..NUM { |
| s.push(self.chars.next()?); |
| } |
| match u32::from_str_radix(&s, 16) { |
| Err(_) => None, |
| Ok(n) => char::from_u32(n), |
| } |
| } |
| } |
| |
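/// Unescape the body of a Unicode-escaped string literal (e.g. Postgres
/// `U&'...'`): `\XXXX` is a 4-hex-digit code point, `\+XXXXXX` a
/// 6-hex-digit one, `\\` a literal backslash, and `''` an escaped quote.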
| fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> { |
| let mut unescaped = String::new(); |
| chars.next(); // consume the opening quote |
| while let Some(c) = chars.next() { |
| match c { |
| '\'' => { |
| if chars.peek() == Some(&'\'') { |
| chars.next(); |
| unescaped.push('\''); |
| } else { |
| return Ok(unescaped); |
| } |
| } |
| '\\' => match chars.peek() { |
| Some('\\') => { |
| chars.next(); |
| unescaped.push('\\'); |
| } |
| Some('+') => { |
| chars.next(); |
| unescaped.push(take_char_from_hex_digits(chars, 6)?); |
| } |
| _ => unescaped.push(take_char_from_hex_digits(chars, 4)?), |
| }, |
| _ => { |
| unescaped.push(c); |
| } |
| } |
| } |
| Err(TokenizerError { |
| message: "Unterminated unicode encoded string literal".to_string(), |
| location: chars.location(), |
| }) |
| } |
| |
| fn take_char_from_hex_digits( |
| chars: &mut State<'_>, |
| max_digits: usize, |
| ) -> Result<char, TokenizerError> { |
| let mut result = 0u32; |
| for _ in 0..max_digits { |
| let next_char = chars.next().ok_or_else(|| TokenizerError { |
| message: "Unexpected EOF while parsing hex digit in escaped unicode string." |
| .to_string(), |
| location: chars.location(), |
| })?; |
| let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError { |
| message: format!("Invalid hex digit in escaped unicode string: {next_char}"), |
| location: chars.location(), |
| })?; |
| result = result * 16 + digit; |
| } |
| char::from_u32(result).ok_or_else(|| TokenizerError { |
| message: format!("Invalid unicode character: {result:x}"), |
| location: chars.location(), |
| }) |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| use crate::dialect::{ |
| BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect, |
| }; |
| use crate::test_utils::{all_dialects_except, all_dialects_where}; |
| use core::fmt::Debug; |
| |
| #[test] |
| fn tokenizer_error_impl() { |
| let err = TokenizerError { |
| message: "test".into(), |
| location: Location { line: 1, column: 1 }, |
| }; |
| #[cfg(feature = "std")] |
| { |
| use std::error::Error; |
| assert!(err.source().is_none()); |
| } |
| assert_eq!(err.to_string(), "test at Line: 1, Column: 1"); |
| } |
| |
| #[test] |
| fn tokenize_select_1() { |
| let sql = String::from("SELECT 1"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1"), false), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_select_float() { |
| let sql = String::from("SELECT .1"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from(".1"), false), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_clickhouse_double_equal() { |
| let sql = String::from("SELECT foo=='1'"); |
| let dialect = ClickHouseDialect {}; |
| let mut tokenizer = Tokenizer::new(&dialect, &sql); |
| let tokens = tokenizer.tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Word(Word { |
| value: "foo".to_string(), |
| quote_style: None, |
| keyword: Keyword::NoKeyword, |
| }), |
| Token::DoubleEq, |
| Token::SingleQuotedString("1".to_string()), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_numeric_literal_underscore() { |
| let dialect = GenericDialect {}; |
| let sql = String::from("SELECT 10_000"); |
| let mut tokenizer = Tokenizer::new(&dialect, &sql); |
| let tokens = tokenizer.tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number("10".to_string(), false), |
| Token::make_word("_000", None), |
| ]; |
| compare(expected, tokens); |
| |
| all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to( |
| "SELECT 10_000, _10_000, 10_00_, 10___0", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number("10_000".to_string(), false), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier) |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number("10_00".to_string(), false), |
| Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects) |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number("10".to_string(), false), |
| Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects) |
| ], |
| ); |
| } |
| |
| #[test] |
| fn tokenize_select_exponent() { |
| let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1e10"), false), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1e-10"), false), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1e+10"), false), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1"), false), |
| Token::make_word("ea", None), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1e-10"), false), |
| Token::make_word("a", None), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1e-10"), false), |
| Token::Minus, |
| Token::Number(String::from("10"), false), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_scalar_function() { |
| let sql = String::from("SELECT sqrt(1)"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("sqrt", None), |
| Token::LParen, |
| Token::Number(String::from("1"), false), |
| Token::RParen, |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_string_string_concat() { |
| let sql = String::from("SELECT 'a' || 'b'"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString(String::from("a")), |
| Token::Whitespace(Whitespace::Space), |
| Token::StringConcat, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString(String::from("b")), |
| ]; |
| |
| compare(expected, tokens); |
| } |

#[test]
| fn tokenize_bitwise_op() { |
| let sql = String::from("SELECT one | two ^ three"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("one", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::Pipe, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("two", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::Caret, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("three", None), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_logical_xor() { |
| let sql = |
| String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("true"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("XOR"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("true"), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("false"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("XOR"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("false"), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("true"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("XOR"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("false"), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("false"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("XOR"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("true"), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_simple_select() { |
| let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Mul, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("customer", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("WHERE"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("id", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::Eq, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1"), false), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("LIMIT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("5"), false), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_explain_select() { |
| let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("EXPLAIN"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Mul, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("customer", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("WHERE"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("id", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::Eq, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1"), false), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_explain_analyze_select() { |
| let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("EXPLAIN"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("ANALYZE"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Mul, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("customer", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("WHERE"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("id", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::Eq, |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1"), false), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_string_predicate() { |
| let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Mul, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("customer", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("WHERE"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("salary", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::Neq, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString(String::from("Not Provided")), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_invalid_string() { |
| let sql = String::from("\n💝مصطفىh"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| // println!("tokens: {:#?}", tokens); |
| let expected = vec![ |
| Token::Whitespace(Whitespace::Newline), |
| Token::Char('💝'), |
| Token::make_word("مصطفىh", None), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_newline_in_string_literal() { |
| let sql = String::from("'foo\r\nbar\nbaz'"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_unterminated_string_literal() { |
| let sql = String::from("select 'foo"); |
| |
| let dialect = GenericDialect {}; |
| let mut tokenizer = Tokenizer::new(&dialect, &sql); |
| assert_eq!( |
| tokenizer.tokenize(), |
| Err(TokenizerError { |
| message: "Unterminated string literal".to_string(), |
| location: Location { line: 1, column: 8 }, |
| }) |
| ); |
| } |
| |
| #[test] |
| fn tokenize_unterminated_string_literal_utf8() { |
| let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;"); |
| |
| let dialect = GenericDialect {}; |
| let mut tokenizer = Tokenizer::new(&dialect, &sql); |
| assert_eq!( |
| tokenizer.tokenize(), |
| Err(TokenizerError { |
| message: "Unterminated string literal".to_string(), |
| location: Location { |
| line: 1, |
| column: 35 |
| } |
| }) |
| ); |
| } |
| |
| #[test] |
| fn tokenize_invalid_string_cols() { |
| let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| // println!("tokens: {:#?}", tokens); |
| let expected = vec![ |
| Token::Whitespace(Whitespace::Newline), |
| Token::Whitespace(Whitespace::Newline), |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Mul, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("table"), |
| Token::Whitespace(Whitespace::Tab), |
| Token::Char('💝'), |
| Token::make_word("مصطفىh", None), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_dollar_quoted_string_tagged() { |
| let test_cases = vec![ |
| ( |
| String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"), |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(), |
| tag: Some("tag".into()), |
| }) |
| ] |
| ), |
| ( |
| String::from("SELECT $abc$x$ab$abc$"), |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "x$ab".into(), |
| tag: Some("abc".into()), |
| }) |
| ] |
| ), |
| ( |
| String::from("SELECT $abc$$abc$"), |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "".into(), |
| tag: Some("abc".into()), |
| }) |
| ] |
| ), |
| ( |
| String::from("0$abc$$abc$1"), |
| vec![ |
| Token::Number("0".into(), false), |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "".into(), |
| tag: Some("abc".into()), |
| }), |
| Token::Number("1".into(), false), |
| ] |
| ), |
| ( |
| String::from("$function$abc$q$data$q$$function$"), |
| vec![ |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "abc$q$data$q$".into(), |
| tag: Some("function".into()), |
| }), |
| ] |
| ), |
| ]; |
| |
| let dialect = GenericDialect {}; |
| for (sql, expected) in test_cases { |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| compare(expected, tokens); |
| } |
| } |
| |
| #[test] |
| fn tokenize_dollar_quoted_string_tagged_unterminated() { |
| let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$"); |
| let dialect = GenericDialect {}; |
| assert_eq!( |
| Tokenizer::new(&dialect, &sql).tokenize(), |
| Err(TokenizerError { |
| message: "Unterminated dollar-quoted, expected $".into(), |
| location: Location { |
| line: 1, |
| column: 91 |
| } |
| }) |
| ); |
| } |
| |
| #[test] |
| fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() { |
| let sql = String::from("SELECT $abc$abc$"); |
| let dialect = GenericDialect {}; |
| assert_eq!( |
| Tokenizer::new(&dialect, &sql).tokenize(), |
| Err(TokenizerError { |
| message: "Unterminated dollar-quoted, expected $".into(), |
| location: Location { |
| line: 1, |
| column: 17 |
| } |
| }) |
| ); |
| } |
| |
| #[test] |
| fn tokenize_dollar_placeholder() { |
| let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC"); |
| let dialect = SQLiteDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| assert_eq!( |
| tokens, |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Placeholder("$$".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Placeholder("$$ABC$$".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Placeholder("$ABC$".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::Placeholder("$ABC".into()), |
| ] |
| ); |
| } |
| |
| #[test] |
| fn tokenize_nested_dollar_quoted_strings() { |
| let sql = String::from("SELECT $tag$dollar $nested$ string$tag$"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "dollar $nested$ string".into(), |
| tag: Some("tag".into()), |
| }), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_dollar_quoted_string_untagged_empty() { |
| let sql = String::from("SELECT $$$$"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "".into(), |
| tag: None, |
| }), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_dollar_quoted_string_untagged() { |
| let sql = |
| String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::DollarQuotedString(DollarQuotedString { |
| value: "within dollar '$' quoted strings have $tags like this$ ".into(), |
| tag: None, |
| }), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_dollar_quoted_string_untagged_unterminated() { |
| let sql = String::from( |
| "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$", |
| ); |
| let dialect = GenericDialect {}; |
| assert_eq!( |
| Tokenizer::new(&dialect, &sql).tokenize(), |
| Err(TokenizerError { |
| message: "Unterminated dollar-quoted string".into(), |
| location: Location { |
| line: 1, |
| column: 86 |
| } |
| }) |
| ); |
| } |
| |
| #[test] |
| fn tokenize_right_arrow() { |
| let sql = String::from("FUNCTION(key=>value)"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_word("FUNCTION", None), |
| Token::LParen, |
| Token::make_word("key", None), |
| Token::RArrow, |
| Token::make_word("value", None), |
| Token::RParen, |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_is_null() { |
| let sql = String::from("a IS NULL"); |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_word("a", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("IS"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("NULL"), |
| ]; |
| |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_comment() { |
| let test_cases = vec![ |
| ( |
| String::from("0--this is a comment\n1"), |
| vec![ |
| Token::Number("0".to_string(), false), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: "this is a comment\n".to_string(), |
| }), |
| Token::Number("1".to_string(), false), |
| ], |
| ), |
| ( |
| String::from("0--this is a comment\r1"), |
| vec![ |
| Token::Number("0".to_string(), false), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: "this is a comment\r1".to_string(), |
| }), |
| ], |
| ), |
| ( |
| String::from("0--this is a comment\r\n1"), |
| vec![ |
| Token::Number("0".to_string(), false), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: "this is a comment\r\n".to_string(), |
| }), |
| Token::Number("1".to_string(), false), |
| ], |
| ), |
| ]; |
| |
| let dialect = GenericDialect {}; |
| |
| for (sql, expected) in test_cases { |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| compare(expected, tokens); |
| } |
| } |
| |
| #[test] |
| fn tokenize_comment_postgres() { |
| let sql = String::from("1--\r0"); |
| |
| let dialect = PostgreSqlDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::Number("1".to_string(), false), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: "\r".to_string(), |
| }), |
| Token::Number("0".to_string(), false), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_comment_at_eof() { |
| let sql = String::from("--this is a comment"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: "this is a comment".to_string(), |
| })]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_multiline_comment() { |
| let sql = String::from("0/*multi-line\n* /comment*/1"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::Number("0".to_string(), false), |
| Token::Whitespace(Whitespace::MultiLineComment( |
| "multi-line\n* /comment".to_string(), |
| )), |
| Token::Number("1".to_string(), false), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_nested_multiline_comment() { |
| all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( |
| "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1", |
| vec![ |
| Token::Number("0".to_string(), false), |
| Token::Whitespace(Whitespace::MultiLineComment( |
| "multi-line\n* \n/* comment \n /*comment*/*/ ".into(), |
| )), |
| Token::Whitespace(Whitespace::Space), |
| Token::Div, |
| Token::Word(Word { |
| value: "comment".to_string(), |
| quote_style: None, |
| keyword: Keyword::COMMENT, |
| }), |
| Token::Mul, |
| Token::Div, |
| Token::Number("1".to_string(), false), |
| ], |
| ); |
| |
| all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( |
| "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1", |
| vec![ |
| Token::Number("0".to_string(), false), |
| Token::Whitespace(Whitespace::MultiLineComment( |
| "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(), |
| )), |
| Token::Number("1".to_string(), false), |
| ], |
| ); |
| |
| all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( |
| "SELECT 1/* a /* b */ c */0", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number("1".to_string(), false), |
| Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())), |
| Token::Number("0".to_string(), false), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn tokenize_nested_multiline_comment_empty() { |
| all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to( |
| "select 1/*/**/*/0", |
| vec![ |
| Token::make_keyword("select"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number("1".to_string(), false), |
| Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())), |
| Token::Number("0".to_string(), false), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn tokenize_nested_comments_if_not_supported() { |
| all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to( |
| "SELECT 1/*/* nested comment */*/0", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number("1".to_string(), false), |
| Token::Whitespace(Whitespace::MultiLineComment( |
| "/* nested comment ".to_string(), |
| )), |
| Token::Mul, |
| Token::Div, |
| Token::Number("0".to_string(), false), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn tokenize_multiline_comment_with_even_asterisks() { |
| let sql = String::from("\n/** Comment **/\n"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::Whitespace(Whitespace::Newline), |
| Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())), |
| Token::Whitespace(Whitespace::Newline), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_unicode_whitespace() { |
| let sql = String::from(" \u{2003}\n"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::Whitespace(Whitespace::Space), |
| Token::Whitespace(Whitespace::Space), |
| Token::Whitespace(Whitespace::Newline), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_mismatched_quotes() { |
| let sql = String::from("\"foo"); |
| |
| let dialect = GenericDialect {}; |
| let mut tokenizer = Tokenizer::new(&dialect, &sql); |
| assert_eq!( |
| tokenizer.tokenize(), |
| Err(TokenizerError { |
| message: "Expected close delimiter '\"' before EOF.".to_string(), |
| location: Location { line: 1, column: 1 }, |
| }) |
| ); |
| } |
| |
| #[test] |
| fn tokenize_newlines() { |
| let sql = String::from("line1\nline2\rline3\r\nline4\r"); |
| |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_word("line1", None), |
| Token::Whitespace(Whitespace::Newline), |
| Token::make_word("line2", None), |
| Token::Whitespace(Whitespace::Newline), |
| Token::make_word("line3", None), |
| Token::Whitespace(Whitespace::Newline), |
| Token::make_word("line4", None), |
| Token::Whitespace(Whitespace::Newline), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_mssql_top() { |
| let sql = "SELECT TOP 5 [bar] FROM foo"; |
| let dialect = MsSqlDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("TOP"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("5"), false), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("bar", Some('[')), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("foo", None), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_pg_regex_match() { |
| let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'"; |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::Tilde, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("^a".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::TildeAsterisk, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("^a".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::ExclamationMarkTilde, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("^a".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::ExclamationMarkTildeAsterisk, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("^a".into()), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_pg_like_match() { |
| let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'"; |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::DoubleTilde, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("_a%".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::DoubleTildeAsterisk, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("_a%".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::ExclamationMarkDoubleTilde, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("_a%".into()), |
| Token::Comma, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("col", None), |
| Token::Whitespace(Whitespace::Space), |
| Token::ExclamationMarkDoubleTildeAsterisk, |
| Token::Whitespace(Whitespace::Space), |
| Token::SingleQuotedString("_a%".into()), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_quoted_identifier() { |
| let sql = r#" "a "" b" "a """ "c """"" "#; |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word(r#"a " b"#, Some('"')), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word(r#"a ""#, Some('"')), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word(r#"c """#, Some('"')), |
| Token::Whitespace(Whitespace::Space), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_snowflake_div() { |
| let sql = r#"field/1000"#; |
| let dialect = SnowflakeDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_word(r#"field"#, None), |
| Token::Div, |
| Token::Number("1000".to_string(), false), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_quoted_identifier_with_no_escape() { |
| let sql = r#" "a "" b" "a """ "c """"" "#; |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql) |
| .with_unescape(false) |
| .tokenize() |
| .unwrap(); |
| let expected = vec![ |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word(r#"a "" b"#, Some('"')), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word(r#"a """#, Some('"')), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word(r#"c """""#, Some('"')), |
| Token::Whitespace(Whitespace::Space), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_with_location() { |
| let sql = "SELECT a,\n b"; |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql) |
| .tokenize_with_location() |
| .unwrap(); |
| let expected = vec![ |
| TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()), |
| TokenWithSpan::at( |
| Token::Whitespace(Whitespace::Space), |
| (1, 7).into(), |
| (1, 8).into(), |
| ), |
| TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()), |
| TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()), |
| TokenWithSpan::at( |
| Token::Whitespace(Whitespace::Newline), |
| (1, 10).into(), |
| (2, 1).into(), |
| ), |
| TokenWithSpan::at( |
| Token::Whitespace(Whitespace::Space), |
| (2, 1).into(), |
| (2, 2).into(), |
| ), |
| TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) { |
| //println!("------------------------------"); |
| //println!("tokens = {:?}", actual); |
| //println!("expected = {:?}", expected); |
| //println!("------------------------------"); |
| assert_eq!(expected, actual); |
| } |
| |
| fn check_unescape(s: &str, expected: Option<&str>) { |
| let s = format!("'{s}'"); |
| let mut state = State { |
| peekable: s.chars().peekable(), |
| line: 0, |
| col: 0, |
| }; |
| |
| assert_eq!( |
| unescape_single_quoted_string(&mut state), |
| expected.map(|s| s.to_string()) |
| ); |
| } |
| |
| #[test] |
| fn test_unescape() { |
| check_unescape(r"\b", Some("\u{0008}")); |
| check_unescape(r"\f", Some("\u{000C}")); |
| check_unescape(r"\t", Some("\t")); |
| check_unescape(r"\r\n", Some("\r\n")); |
| check_unescape(r"\/", Some("/")); |
| check_unescape(r"/", Some("/")); |
| check_unescape(r"\\", Some("\\")); |
| |
// 16- and 32-bit hexadecimal Unicode character values
| check_unescape(r"\u0001", Some("\u{0001}")); |
| check_unescape(r"\u4c91", Some("\u{4c91}")); |
| check_unescape(r"\u4c916", Some("\u{4c91}6")); |
| check_unescape(r"\u4c", None); |
| check_unescape(r"\u0000", None); |
| check_unescape(r"\U0010FFFF", Some("\u{10FFFF}")); |
| check_unescape(r"\U00110000", None); |
| check_unescape(r"\U00000000", None); |
| check_unescape(r"\u", None); |
| check_unescape(r"\U", None); |
| check_unescape(r"\U1010FFFF", None); |
| |
| // hexadecimal byte value |
| check_unescape(r"\x4B", Some("\u{004b}")); |
| check_unescape(r"\x4", Some("\u{0004}")); |
| check_unescape(r"\x4L", Some("\u{0004}L")); |
| check_unescape(r"\x", Some("x")); |
| check_unescape(r"\xP", Some("xP")); |
| check_unescape(r"\x0", None); |
| check_unescape(r"\xCAD", None); |
| check_unescape(r"\xA9", None); |
| |
| // octal byte value |
| check_unescape(r"\1", Some("\u{0001}")); |
| check_unescape(r"\12", Some("\u{000a}")); |
| check_unescape(r"\123", Some("\u{0053}")); |
| check_unescape(r"\1232", Some("\u{0053}2")); |
| check_unescape(r"\4", Some("\u{0004}")); |
| check_unescape(r"\45", Some("\u{0025}")); |
| check_unescape(r"\450", Some("\u{0028}")); |
| check_unescape(r"\603", None); |
| check_unescape(r"\0", None); |
| check_unescape(r"\080", None); |
| |
| // others |
| check_unescape(r"\9", Some("9")); |
| check_unescape(r"''", Some("'")); |
| check_unescape( |
| r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232", |
| Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"), |
| ); |
| check_unescape(r"Hello\0", None); |
| check_unescape(r"Hello\xCADRust", None); |
| } |
| |
| #[test] |
| fn tokenize_numeric_prefix_trait() { |
| #[derive(Debug)] |
| struct NumericPrefixDialect; |
| |
| impl Dialect for NumericPrefixDialect { |
| fn is_identifier_start(&self, ch: char) -> bool { |
| ch.is_ascii_lowercase() |
| || ch.is_ascii_uppercase() |
| || ch.is_ascii_digit() |
| || ch == '$' |
| } |
| |
| fn is_identifier_part(&self, ch: char) -> bool { |
| ch.is_ascii_lowercase() |
| || ch.is_ascii_uppercase() |
| || ch.is_ascii_digit() |
| || ch == '_' |
| || ch == '$' |
| || ch == '{' |
| || ch == '}' |
| } |
| |
| fn supports_numeric_prefix(&self) -> bool { |
| true |
| } |
| } |
| |
| tokenize_numeric_prefix_inner(&NumericPrefixDialect {}); |
| tokenize_numeric_prefix_inner(&HiveDialect {}); |
| tokenize_numeric_prefix_inner(&MySqlDialect {}); |
| } |
| |
| fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) { |
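| // Even when identifiers may start with digits, a bare `1` after FROM must |
| // still tokenize as a number |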
| let sql = r#"SELECT * FROM 1"#; |
| let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Mul, |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Number(String::from("1"), false), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn tokenize_quoted_string_escape() { |
| let dialect = SnowflakeDialect {}; |
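| // Each case lists the raw SQL, the token contents with unescaping disabled |
| // (escape sequences kept verbatim), and the contents with unescaping enabled |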
| for (sql, expected, expected_unescaped) in [ |
| (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#), |
| (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#), |
| (r#"'\\'"#, r#"\\"#, r#"\"#), |
| ( |
| r#"'\0\a\b\f\n\r\t\Z'"#, |
| r#"\0\a\b\f\n\r\t\Z"#, |
| "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}", |
| ), |
| (r#"'\"'"#, r#"\""#, "\""), |
| (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#), |
| (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#), |
| (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#), |
| (r#"'\q'"#, r#"\q"#, r#"q"#), |
| (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#), |
| (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#), |
| ] { |
| let tokens = Tokenizer::new(&dialect, sql) |
| .with_unescape(false) |
| .tokenize() |
| .unwrap(); |
| let expected = vec![Token::SingleQuotedString(expected.to_string())]; |
| compare(expected, tokens); |
| |
| let tokens = Tokenizer::new(&dialect, sql) |
| .with_unescape(true) |
| .tokenize() |
| .unwrap(); |
| let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())]; |
| compare(expected, tokens); |
| } |
| |
| for sql in [r#"'\'"#, r#"'ab\'"#] { |
| let mut tokenizer = Tokenizer::new(&dialect, sql); |
| assert_eq!( |
| "Unterminated string literal", |
| tokenizer.tokenize().unwrap_err().message.as_str(), |
| ); |
| } |
| |
| // Dialects without backslash escapes keep the backslash literally, so the |
| // final quote still terminates the string |
| for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] { |
| let dialect = GenericDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| |
| let expected = vec![Token::SingleQuotedString(expected.to_string())]; |
| |
| compare(expected, tokens); |
| } |
| |
| // MySQL keeps \% and \_ verbatim: they are LIKE pattern escapes resolved at |
| // match time, not string escapes |
| for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] { |
| let dialect = MySqlDialect {}; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| |
| let expected = vec![Token::SingleQuotedString(expected.to_string())]; |
| |
| compare(expected, tokens); |
| } |
| } |
| |
| #[test] |
| fn tokenize_triple_quoted_string() { |
| fn check<F>( |
| q: char, // The quote character under test. |
| r: char, // An alternate quote character. |
| quote_token: F, |
| ) where |
| F: Fn(String) -> Token, |
| { |
| let dialect = BigQueryDialect {}; |
| |
| for (sql, expected, expected_unescaped) in [ |
| // Empty string |
| (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()), |
| // Should not count escaped quote as end of string. |
| ( |
| format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#), |
| format!(r#"ab{q}{q}\{q}{q}cd"#), |
| format!(r#"ab{q}{q}{q}{q}cd"#), |
| ), |
| // Simple string |
| ( |
| format!(r#"{q}{q}{q}abc{q}{q}{q}"#), |
| "abc".into(), |
| "abc".into(), |
| ), |
| // The alternate quote character, even tripled, passes through unchanged. |
| ( |
| format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#), |
| format!("ab{r}{r}{r}c{r}def{r}{r}{r}"), |
| format!("ab{r}{r}{r}c{r}def{r}{r}{r}"), |
| ), |
| // Escaped quote. |
| ( |
| format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#), |
| format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#), |
| format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#), |
| ), |
| // Backslash-escaped quote characters. |
| ( |
| format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#), |
| r#"a\'\'b\'c\'d"#.into(), |
| r#"a''b'c'd"#.into(), |
| ), |
| // Backslash-escaped control characters. |
| ( |
| format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#), |
| r#"abc\0\n\rdef"#.into(), |
| "abc\0\n\rdef".into(), |
| ), |
| ] { |
| let tokens = Tokenizer::new(&dialect, sql.as_str()) |
| .with_unescape(false) |
| .tokenize() |
| .unwrap(); |
| let expected = vec![quote_token(expected.to_string())]; |
| compare(expected, tokens); |
| |
| let tokens = Tokenizer::new(&dialect, sql.as_str()) |
| .with_unescape(true) |
| .tokenize() |
| .unwrap(); |
| let expected = vec![quote_token(expected_unescaped.to_string())]; |
| compare(expected, tokens); |
| } |
| |
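| // Each of these inputs is an unterminated triple-quoted string and must |
| // produce a tokenizer error rather than a token |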
| for sql in [ |
| format!(r#"{q}{q}{q}{q}{q}\{q}"#), |
| format!(r#"{q}{q}{q}abc{q}{q}\{q}"#), |
| format!(r#"{q}{q}{q}{q}"#), |
| format!(r#"{q}{q}{q}{r}{r}"#), |
| format!(r#"{q}{q}{q}abc{q}"#), |
| format!(r#"{q}{q}{q}abc{q}{q}"#), |
| format!(r#"{q}{q}{q}abc"#), |
| ] { |
| let dialect = BigQueryDialect {}; |
| let mut tokenizer = Tokenizer::new(&dialect, sql.as_str()); |
| assert_eq!( |
| "Unterminated string literal", |
| tokenizer.tokenize().unwrap_err().message.as_str(), |
| ); |
| } |
| } |
| |
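| // Exercise both triple-quote styles, using the other quote character as the |
| // embedded alternate |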
| check('"', '\'', Token::TripleDoubleQuotedString); |
| |
| check('\'', '"', Token::TripleSingleQuotedString); |
| |
| let dialect = BigQueryDialect {}; |
| |
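| // A pair of quotes of each kind is an empty string, not the start of a |
| // triple-quoted string |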
| let sql = r#"""''"#; |
| let tokens = Tokenizer::new(&dialect, sql) |
| .with_unescape(true) |
| .tokenize() |
| .unwrap(); |
| let expected = vec![ |
| Token::DoubleQuotedString("".to_string()), |
| Token::SingleQuotedString("".to_string()), |
| ]; |
| compare(expected, tokens); |
| |
| let sql = r#"''"""#; |
| let tokens = Tokenizer::new(&dialect, sql) |
| .with_unescape(true) |
| .tokenize() |
| .unwrap(); |
| let expected = vec![ |
| Token::SingleQuotedString("".to_string()), |
| Token::DoubleQuotedString("".to_string()), |
| ]; |
| compare(expected, tokens); |
| |
| // A dialect without triple-quoted strings reads '''''' as a single string |
| // consisting of two escaped quotes |
| let dialect = SnowflakeDialect {}; |
| let sql = r#"''''''"#; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![Token::SingleQuotedString("''".to_string())]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn test_mysql_users_grantees() { |
| let dialect = MySqlDialect {}; |
| |
| let sql = "CREATE USER `root`@`%`"; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("CREATE"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("USER"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("root", Some('`')), |
| Token::AtSign, |
| Token::make_word("%", Some('`')), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn test_postgres_abs_without_space_and_string_literal() { |
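| // An `@` immediately followed by a string literal must lex as a bare AtSign |
| // plus the literal; the name refers to PostgreSQL's abs operator `@` |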
| let dialect = MySqlDialect {}; |
| |
| let sql = "SELECT @'1'"; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::AtSign, |
| Token::SingleQuotedString("1".to_string()), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn test_postgres_abs_without_space_and_quoted_column() { |
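| // As above, but the `@` directly precedes a double-quoted name |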
| let dialect = MySqlDialect {}; |
| |
| let sql = r#"SELECT @"bar" FROM foo"#; |
| let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::AtSign, |
| Token::DoubleQuotedString("bar".to_string()), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_keyword("FROM"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("foo", None), |
| ]; |
| compare(expected, tokens); |
| } |
| |
| #[test] |
| fn test_national_strings_backslash_escape_not_supported() { |
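| // The SQL here is `select n'''''\'`: the two doubled quotes collapse to '', |
| // the bare backslash stays literal, and the final quote closes the string |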
| all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape()) |
| .tokenizes_to( |
| "select n'''''\\'", |
| vec![ |
| Token::make_keyword("select"), |
| Token::Whitespace(Whitespace::Space), |
| Token::NationalStringLiteral("''\\".to_string()), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn test_national_strings_backslash_escape_supported() { |
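| // The SQL here is `select n'''''\''`: two doubled quotes plus a |
| // backslash-escaped quote unescape to three quotes in a row |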
| all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape()) |
| .tokenizes_to( |
| "select n'''''\\''", |
| vec![ |
| Token::make_keyword("select"), |
| Token::Whitespace(Whitespace::Space), |
| Token::NationalStringLiteral("'''".to_string()), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn test_string_escape_constant_not_supported() { |
| all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( |
| "select e'...'", |
| vec![ |
| Token::make_keyword("select"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("e", None), |
| Token::SingleQuotedString("...".to_string()), |
| ], |
| ); |
| |
| all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( |
| "select E'...'", |
| vec![ |
| Token::make_keyword("select"), |
| Token::Whitespace(Whitespace::Space), |
| Token::make_word("E", None), |
| Token::SingleQuotedString("...".to_string()), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn test_string_escape_constant_supported() { |
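| // The SQL here is `select e'\''`: inside an escape-string constant the |
| // backslash-escaped quote unescapes to a single quote |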
| all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( |
| "select e'\\''", |
| vec![ |
| Token::make_keyword("select"), |
| Token::Whitespace(Whitespace::Space), |
| Token::EscapedStringLiteral("'".to_string()), |
| ], |
| ); |
| |
| all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( |
| "select E'\\''", |
| vec![ |
| Token::make_keyword("select"), |
| Token::Whitespace(Whitespace::Space), |
| Token::EscapedStringLiteral("'".to_string()), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn test_whitespace_required_after_single_line_comment() { |
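| // When a dialect requires whitespace after `--`, the input `--'abc'` is two |
| // minus operators followed by a string, not a comment |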
| all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace()) |
| .tokenizes_to( |
| "SELECT --'abc'", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Minus, |
| Token::Minus, |
| Token::SingleQuotedString("abc".to_string()), |
| ], |
| ); |
| |
| all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace()) |
| .tokenizes_to( |
| "SELECT -- 'abc'", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: " 'abc'".to_string(), |
| }), |
| ], |
| ); |
| |
| all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace()) |
| .tokenizes_to( |
| "SELECT --", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Minus, |
| Token::Minus, |
| ], |
| ); |
| } |
| |
| #[test] |
| fn test_whitespace_not_required_after_single_line_comment() { |
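| // Without that requirement, everything after `--` up to end of line is the |
| // comment body, even when it is empty |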
| all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace()) |
| .tokenizes_to( |
| "SELECT --'abc'", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: "'abc'".to_string(), |
| }), |
| ], |
| ); |
| |
| all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace()) |
| .tokenizes_to( |
| "SELECT -- 'abc'", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: " 'abc'".to_string(), |
| }), |
| ], |
| ); |
| |
| all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace()) |
| .tokenizes_to( |
| "SELECT --", |
| vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Whitespace(Whitespace::SingleLineComment { |
| prefix: "--".to_string(), |
| comment: "".to_string(), |
| }), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn test_tokenize_identifiers_numeric_prefix() { |
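| // On its own, `12e34` lexes as a number in scientific notation, but after a |
| // period it is an identifier with a numeric prefix |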
| all_dialects_where(|dialect| dialect.supports_numeric_prefix()) |
| .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]); |
| |
| all_dialects_where(|dialect| dialect.supports_numeric_prefix()) |
| .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]); |
| |
| all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( |
| "t.12e34", |
| vec![ |
| Token::make_word("t", None), |
| Token::Period, |
| Token::make_word("12e34", None), |
| ], |
| ); |
| |
| all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to( |
| "t.1two3", |
| vec![ |
| Token::make_word("t", None), |
| Token::Period, |
| Token::make_word("1two3", None), |
| ], |
| ); |
| } |
| |
| #[test] |
| fn tokenize_period_underscore() { |
| let sql = String::from("SELECT table._col"); |
| // A dialect that supports underscores in numeric literals. |
| let dialect = PostgreSqlDialect {}; |
| let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); |
| |
| let expected = vec![ |
| Token::make_keyword("SELECT"), |
| Token::Whitespace(Whitespace::Space), |
| Token::Word(Word { |
| value: "table".to_string(), |
| quote_style: None, |
| keyword: Keyword::TABLE, |
| }), |
| Token::Period, |
| Token::Word(Word { |
| value: "_col".to_string(), |
| quote_style: None, |
| keyword: Keyword::NoKeyword, |
| }), |
| ]; |
| |
| compare(expected, tokens); |
| |
| let sql = String::from("SELECT ._123"); |
| if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { |
| panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); |
| } |
| |
| let sql = String::from("SELECT ._abc"); |
| if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { |
| panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); |
| } |
| } |
| } |