Reduce string copies with Cow (#2075)
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 1ca5031..745c735 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -23,12 +23,15 @@
#[cfg(not(feature = "std"))]
use alloc::{
- borrow::ToOwned,
+ borrow::{Cow, ToOwned},
format,
string::{String, ToString},
vec,
vec::Vec,
};
+#[cfg(feature = "std")]
+use std::borrow::Cow;
+
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
@@ -934,7 +937,7 @@
chars: &mut State<'a>,
) -> Result<Option<Token>, TokenizerError> {
chars.next(); // consume the first char
- let word = self.tokenize_word(consumed_byte_len, chars);
+ let word = self.tokenize_word(consumed_byte_len, chars)?;
// TODO: implement parsing of exponent here
if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
@@ -1008,7 +1011,7 @@
}
_ => {
// regular identifier starting with an "b" or "B"
- let s = self.tokenize_word(b.len_utf8(), chars);
+ let s = self.tokenize_word(b.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1035,7 +1038,7 @@
),
_ => {
// regular identifier starting with an "r" or "R"
- let s = self.tokenize_word(b.len_utf8(), chars);
+ let s = self.tokenize_word(b.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1054,7 +1057,7 @@
}
_ => {
// regular identifier starting with an "N"
- let s = self.tokenize_word(n.len_utf8(), chars);
+ let s = self.tokenize_word(n.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1071,7 +1074,7 @@
}
_ => {
// regular identifier starting with an "E" or "e"
- let s = self.tokenize_word(x.len_utf8(), chars);
+ let s = self.tokenize_word(x.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1090,7 +1093,7 @@
}
}
// regular identifier starting with an "U" or "u"
- let s = self.tokenize_word(x.len_utf8(), chars);
+ let s = self.tokenize_word(x.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
@@ -1105,7 +1108,7 @@
}
_ => {
// regular identifier starting with an "X"
- let s = self.tokenize_word(x.len_utf8(), chars);
+ let s = self.tokenize_word(x.len_utf8(), chars)?;
Ok(Some(Token::make_word(&s, None)))
}
}
@@ -1351,7 +1354,7 @@
if is_comment {
chars.next(); // consume second '-'
- let comment = self.tokenize_single_line_comment(chars);
+ let comment = self.tokenize_single_line_comment(chars)?;
return Ok(Some(Token::Whitespace(
Whitespace::SingleLineComment {
prefix: "--".to_owned(),
@@ -1382,7 +1385,7 @@
}
Some('/') if dialect_of!(self is SnowflakeDialect) => {
chars.next(); // consume the second '/', starting a snowflake single-line comment
- let comment = self.tokenize_single_line_comment(chars);
+ let comment = self.tokenize_single_line_comment(chars)?;
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "//".to_owned(),
comment,
@@ -1588,7 +1591,7 @@
'#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
{
chars.next(); // consume the '#', starting a snowflake single-line comment
- let comment = self.tokenize_single_line_comment(chars);
+ let comment = self.tokenize_single_line_comment(chars)?;
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "#".to_owned(),
comment,
@@ -1783,80 +1786,133 @@
}
/// Tokenize a dollar-preceded value (i.e., a string or placeholder)
- fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
- let mut s = String::new();
- let mut value = String::new();
+ fn tokenize_dollar_preceded_value(
+ &self,
+ chars: &mut State<'a>,
+ ) -> Result<Token, TokenizerError> {
+ let starting_loc = chars.location();
- chars.next();
+ // Validate we're at a $ before consuming
+ if chars.peek() != Some(&'$') {
+ return self.tokenizer_error(starting_loc, "Expected $ character");
+ }
+ chars.next(); // consume first $
- // If the dialect does not support dollar-quoted strings, then `$$` is rather a placeholder.
+ // Case 1: $$text$$ (untagged dollar-quoted string)
if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
- chars.next();
+ let (value, tag) = self.tokenize_dollar_quoted_string_borrowed(chars, None)?;
+ return Ok(Token::DollarQuotedString(DollarQuotedString {
+ value: value.into_owned(),
+ tag: tag.map(|t| t.into_owned()),
+ }));
+ }
- let mut is_terminated = false;
- let mut prev: Option<char> = None;
+ // If it's not $$, there are two options:
+ // Case 2: $tag$text$tag$ (tagged dollar-quoted string) if dialect supports it
+ // Case 3: $placeholder (e.g., $1, $name)
+ let tag_start = chars.byte_pos;
+ // Consume the tag characters; the tag text is recovered below by byte range.
+ peeking_take_while_ref(chars, |ch| {
+ ch.is_alphanumeric()
+ || ch == '_'
+ || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
+ });
+ let tag_end = chars.byte_pos;
- while let Some(&ch) = chars.peek() {
- if prev == Some('$') {
- if ch == '$' {
- chars.next();
- is_terminated = true;
- break;
- } else {
- s.push('$');
- s.push(ch);
+ // Case 2: $tag$text$tag$ (tagged dollar-quoted string)
+ if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
+ let tag_value = self.safe_slice(chars.source, tag_start, tag_end, starting_loc)?;
+ let (value, tag) =
+ self.tokenize_dollar_quoted_string_borrowed(chars, Some(tag_value))?;
+ return Ok(Token::DollarQuotedString(DollarQuotedString {
+ value: value.into_owned(),
+ tag: tag.map(|t| t.into_owned()),
+ }));
+ }
+
+ // Case 3: $placeholder (e.g., $1, $name)
+ let tag_value = self.safe_slice(chars.source, tag_start, tag_end, starting_loc)?;
+ Ok(Token::Placeholder(format!("${}", tag_value)))
+ }
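
// Illustrative sketch (not part of this diff): the three cases above exercised
// through the crate's public API; token shapes are taken from this function.
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn dollar_cases() {
    let dialect = PostgreSqlDialect {};
    // Case 1: $$text$$ -> untagged dollar-quoted string
    let t = Tokenizer::new(&dialect, "$$hello$$").tokenize().unwrap();
    assert!(matches!(&t[0], Token::DollarQuotedString(s) if s.value == "hello" && s.tag.is_none()));
    // Case 2: $tag$text$tag$ -> tagged dollar-quoted string
    let t = Tokenizer::new(&dialect, "$tag$text$tag$").tokenize().unwrap();
    assert!(matches!(&t[0], Token::DollarQuotedString(s) if s.value == "text" && s.tag.as_deref() == Some("tag")));
    // Case 3: $1 -> placeholder
    let t = Tokenizer::new(&dialect, "$1").tokenize().unwrap();
    assert_eq!(t[0], Token::Placeholder("$1".into()));
}
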
+
+ /// Tokenize a dollar-quoted string (`$$text$$` or `$tag$text$tag$`), returning borrowed slices.
+ /// `tag_prefix` is `None` for `$$` and `Some("tag")` for `$tag$`.
+ /// Returns `(value, tag)` as `(Cow<'a, str>, Option<Cow<'a, str>>)`.
+ fn tokenize_dollar_quoted_string_borrowed(
+ &self,
+ chars: &mut State<'a>,
+ tag_prefix: Option<&'a str>,
+ ) -> Result<(Cow<'a, str>, Option<Cow<'a, str>>), TokenizerError> {
+ let starting_loc = chars.location();
+
+ // Validate we're at a $ before consuming
+ if chars.peek() != Some(&'$') {
+ return self.tokenizer_error(starting_loc, "Expected $ for dollar-quoted string");
+ }
+ chars.next(); // consume $ after tag (or second $ for $$)
+ let content_start = chars.byte_pos;
+
+ match tag_prefix {
+ None => {
+ // Case: $$text$$
+ let mut prev: Option<char> = None;
+
+ while let Some(&ch) = chars.peek() {
+ if prev == Some('$') && ch == '$' {
+ chars.next(); // consume final $
+ // content_end is before the first $ of $$
+ let content_end = chars.byte_pos - 2;
+ let value = self.safe_slice(
+ chars.source,
+ content_start,
+ content_end,
+ starting_loc,
+ )?;
+ return Ok((Cow::Borrowed(value), None));
}
- } else if ch != '$' {
- s.push(ch);
+
+ prev = Some(ch);
+ chars.next();
}
- prev = Some(ch);
- chars.next();
- }
-
- return if chars.peek().is_none() && !is_terminated {
self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
- } else {
- Ok(Token::DollarQuotedString(DollarQuotedString {
- value: s,
- tag: None,
- }))
- };
- } else {
- value.push_str(&peeking_take_while(chars, |ch| {
- ch.is_alphanumeric()
- || ch == '_'
- // Allow $ as a placeholder character if the dialect supports it
- || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
- }));
+ }
+ Some(tag) => {
+ // Case: $tag$text$tag$
+ let end_delimiter = format!("${}$", tag);
- // If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
- if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
- chars.next();
-
- let mut temp = String::new();
- let end_delimiter = format!("${value}$");
-
+ // Scan for the end delimiter
+ let buffer_start = content_start;
loop {
match chars.next() {
- Some(ch) => {
- temp.push(ch);
+ Some(_) => {
+ let current_pos = chars.byte_pos;
+ let buffer = self.safe_slice(
+ chars.source,
+ buffer_start,
+ current_pos,
+ starting_loc,
+ )?;
- if temp.ends_with(&end_delimiter) {
- if let Some(temp) = temp.strip_suffix(&end_delimiter) {
- s.push_str(temp);
- }
- break;
+ if buffer.ends_with(&end_delimiter) {
+ // Found the end delimiter
+ let content_end = current_pos - end_delimiter.len();
+ let value = self.safe_slice(
+ chars.source,
+ content_start,
+ content_end,
+ starting_loc,
+ )?;
+ return Ok((
+ Cow::Borrowed(value),
+ if tag.is_empty() {
+ None
+ } else {
+ Some(Cow::Borrowed(tag))
+ },
+ ));
}
}
None => {
- if temp.ends_with(&end_delimiter) {
- if let Some(temp) = temp.strip_suffix(&end_delimiter) {
- s.push_str(temp);
- }
- break;
- }
-
return self.tokenizer_error(
chars.location(),
"Unterminated dollar-quoted, expected $",
@@ -1864,15 +1920,23 @@
}
}
}
- } else {
- return Ok(Token::Placeholder(String::from("$") + &value));
}
}
+ }
- Ok(Token::DollarQuotedString(DollarQuotedString {
- value: s,
- tag: if value.is_empty() { None } else { Some(value) },
- }))
+ /// Helper function to safely slice a string with bounds validation
+ fn safe_slice<'b>(
+ &self,
+ source: &'b str,
+ start: usize,
+ end: usize,
+ error_loc: Location,
+ ) -> Result<&'b str, TokenizerError> {
+ // Validate slice bounds
+ if end < start || end > source.len() {
+ return self.tokenizer_error(error_loc, "Invalid string slice bounds");
+ }
+ Ok(&source[start..end])
}
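
// The checked-slice pattern in isolation (hypothetical standalone version; the
// real helper above reports a TokenizerError with a Location instead). Note the
// indices must also fall on char boundaries, which the tokenizer guarantees by
// only ever advancing byte_pos by len_utf8().
fn checked_slice(source: &str, start: usize, end: usize) -> Result<&str, String> {
    if end < start || end > source.len() {
        return Err(format!("invalid slice bounds {start}..{end}"));
    }
    Ok(&source[start..end])
}

fn checked_slice_demo() {
    assert_eq!(checked_slice("SELECT x", 0, 6), Ok("SELECT"));
    assert!(checked_slice("SELECT x", 6, 4).is_err()); // end < start
    assert!(checked_slice("SELECT x", 0, 99).is_err()); // end > len
}
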
fn tokenizer_error<R>(
@@ -1887,63 +1951,90 @@
}
// Consume characters until newline
- fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
- let mut comment = peeking_take_while(chars, |ch| match ch {
+ fn tokenize_single_line_comment(
+ &self,
+ chars: &mut State<'a>,
+ ) -> Result<String, TokenizerError> {
+ Ok(self
+ .tokenize_single_line_comment_borrowed(chars)?
+ .to_string())
+ }
+
+ /// Tokenize a single-line comment, returning a borrowed slice.
+ /// Returns a slice that includes the terminating newline character, if one was present.
+ fn tokenize_single_line_comment_borrowed(
+ &self,
+ chars: &mut State<'a>,
+ ) -> Result<&'a str, TokenizerError> {
+ let start_pos = chars.byte_pos;
+ let error_loc = chars.location();
+
+ // Consume until newline
+ peeking_take_while_ref(chars, |ch| match ch {
'\n' => false, // Always stop at \n
'\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
_ => true, // Keep consuming for other characters
});
+ // Consume the newline character
if let Some(ch) = chars.next() {
assert!(ch == '\n' || ch == '\r');
- comment.push(ch);
}
- comment
+ // Return the slice, including the newline when present
+ self.safe_slice(chars.source, start_pos, chars.byte_pos, error_loc)
}
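
// Sketch (assuming the crate's public API): the borrowed comment slice keeps the
// trailing newline, matching the previous String-building behavior.
use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::{Token, Tokenizer, Whitespace};

fn comment_keeps_newline() {
    let dialect = GenericDialect {};
    let tokens = Tokenizer::new(&dialect, "1 -- hi\n2").tokenize().unwrap();
    assert!(tokens.iter().any(|tok| matches!(
        tok,
        Token::Whitespace(Whitespace::SingleLineComment { comment, .. }) if comment == " hi\n"
    )));
}
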
/// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
/// `consumed_byte_len` is the byte length of the consumed character(s).
- fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
+ fn tokenize_word(
+ &self,
+ consumed_byte_len: usize,
+ chars: &mut State<'a>,
+ ) -> Result<String, TokenizerError> {
+ let error_loc = chars.location();
+
// Overflow check: ensure we can safely subtract
if consumed_byte_len > chars.byte_pos {
- return String::new();
+ return self.tokenizer_error(error_loc, "Invalid byte position in tokenize_word");
}
// Calculate where the first character started
let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
// Use the zero-copy version and convert to String
- self.tokenize_word_borrowed(first_char_byte_pos, chars)
- .to_string()
+ Ok(self
+ .tokenize_word_borrowed(first_char_byte_pos, chars)?
+ .to_string())
}
/// Tokenize an identifier or keyword, returning a borrowed slice when possible.
/// The first character position must be provided (before it was consumed).
/// Returns a slice with the same lifetime as the State's source.
- fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
+ fn tokenize_word_borrowed(
+ &self,
+ first_char_byte_pos: usize,
+ chars: &mut State<'a>,
+ ) -> Result<&'a str, TokenizerError> {
+ let error_loc = chars.location();
+
// Consume the rest of the word
peeking_take_while_ref(chars, |ch| self.dialect.is_identifier_part(ch));
- // Boundary check: ensure first_char_byte_pos is valid
- if first_char_byte_pos > chars.byte_pos || first_char_byte_pos > chars.source.len() {
- return "";
- }
-
- // Return a slice from the first char to the current position
- &chars.source[first_char_byte_pos..chars.byte_pos]
+ // Return a slice from the first char to the current position using safe_slice
+ self.safe_slice(chars.source, first_char_byte_pos, chars.byte_pos, error_loc)
}
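
// The zero-copy scan pattern used throughout this PR, in isolation: record a
// byte position, advance by char widths, slice once at the end. (Hypothetical
// standalone analogue of peeking_take_while_ref.)
fn take_while_at(source: &str, start: usize, pred: impl Fn(char) -> bool) -> &str {
    let mut end = start;
    for ch in source[start..].chars() {
        if !pred(ch) {
            break;
        }
        end += ch.len_utf8(); // keeps `end` on a char boundary
    }
    &source[start..end]
}

fn take_while_demo() {
    assert_eq!(take_while_at("foo_bar + 1", 0, |c| c.is_alphanumeric() || c == '_'), "foo_bar");
}
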
/// Read a quoted identifier
fn tokenize_quoted_identifier(
&self,
quote_start: char,
- chars: &mut State,
+ chars: &mut State<'a>,
) -> Result<String, TokenizerError> {
let error_loc = chars.location();
chars.next(); // consume the opening quote
let quote_end = Word::matching_end_quote(quote_start);
- let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
+ let (s, last_char) = self.parse_quoted_ident(chars, quote_end)?;
if last_char == Some(quote_end) {
Ok(s)
@@ -2152,9 +2243,21 @@
fn tokenize_multiline_comment(
&self,
- chars: &mut State,
+ chars: &mut State<'a>,
) -> Result<Option<Token>, TokenizerError> {
- let mut s = String::new();
+ let s = self.tokenize_multiline_comment_borrowed(chars)?;
+ Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(
+ s.to_string(),
+ ))))
+ }
+
+ /// Tokenize a multi-line comment, returning a borrowed slice.
+ /// Returns a slice that excludes the opening `/*` (already consumed) and the final closing `*/`.
+ fn tokenize_multiline_comment_borrowed(
+ &self,
+ chars: &mut State<'a>,
+ ) -> Result<&'a str, TokenizerError> {
+ let start_pos = chars.byte_pos;
let mut nested = 1;
let supports_nested_comments = self.dialect.supports_nested_comments();
@@ -2162,24 +2265,22 @@
match chars.next() {
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
chars.next(); // consume the '*'
- s.push('/');
- s.push('*');
nested += 1;
}
Some('*') if matches!(chars.peek(), Some('/')) => {
chars.next(); // consume the '/'
nested -= 1;
if nested == 0 {
- break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
+ // We've consumed the final */, so exclude it from the slice
+ let end_pos = chars.byte_pos - 2; // Subtract 2 bytes for '*' and '/'
+ return self.safe_slice(chars.source, start_pos, end_pos, chars.location());
}
- s.push('*');
- s.push('/');
}
- Some(ch) => {
- s.push(ch);
+ Some(_) => {
+ // Just consume the character; the final borrowed slice covers it
}
None => {
- break self.tokenizer_error(
+ return self.tokenizer_error(
chars.location(),
"Unexpected EOF while in a multi-line comment",
);
@@ -2188,27 +2289,71 @@
}
}
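
// Sketch (assuming the crate's public API; PostgreSqlDialect supports nested
// comments): the borrowed slice spans everything between the outermost /* and
// */, nested pairs included.
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer, Whitespace};

fn nested_comment() {
    let dialect = PostgreSqlDialect {};
    let tokens = Tokenizer::new(&dialect, "/* a /* b */ c */").tokenize().unwrap();
    assert!(matches!(
        &tokens[0],
        Token::Whitespace(Whitespace::MultiLineComment(c)) if c == " a /* b */ c "
    ));
}
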
- fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
+ fn parse_quoted_ident(
+ &self,
+ chars: &mut State<'a>,
+ quote_end: char,
+ ) -> Result<(String, Option<char>), TokenizerError> {
+ let (cow, last_char) = self.parse_quoted_ident_borrowed(chars, quote_end)?;
+ Ok((cow.into_owned(), last_char))
+ }
+
+ /// Parse a quoted identifier, returning a borrowed slice when possible.
+ /// Returns `(Cow<'a, str>, Option<char>)` where the `Option<char>` is the closing quote.
+ fn parse_quoted_ident_borrowed(
+ &self,
+ chars: &mut State<'a>,
+ quote_end: char,
+ ) -> Result<(Cow<'a, str>, Option<char>), TokenizerError> {
+ let content_start = chars.byte_pos;
+ let mut has_doubled_quotes = false;
let mut last_char = None;
- let mut s = String::new();
+
+ // Scan to find the end and detect doubled quotes
while let Some(ch) = chars.next() {
if ch == quote_end {
if chars.peek() == Some("e_end) {
- chars.next();
- s.push(ch);
- if !self.unescape {
- // In no-escape mode, the given query has to be saved completely
- s.push(ch);
- }
+ has_doubled_quotes = true;
+ chars.next(); // consume the second quote
} else {
last_char = Some(quote_end);
break;
}
- } else {
- s.push(ch);
}
}
- (s, last_char)
+
+ let content_end = if last_char.is_some() {
+ chars.byte_pos - 1 // exclude the closing quote
+ } else {
+ chars.byte_pos
+ };
+
+ let content =
+ self.safe_slice(chars.source, content_start, content_end, chars.location())?;
+
+ // Borrow whenever the content needs no rewriting: either there are no
+ // doubled quotes, or we're in no-escape mode, where the query text must
+ // be preserved verbatim.
+ if !has_doubled_quotes || !self.unescape {
+ return Ok((Cow::Borrowed(content), last_char));
+ }
+
+ // Need to unescape: process doubled quotes
+ let mut result = String::new();
+ let mut chars_iter = content.chars();
+
+ while let Some(ch) = chars_iter.next() {
+ result.push(ch);
+ if ch == quote_end {
+ // This is the first of a doubled quote, skip the second one
+ chars_iter.next();
+ }
+ }
+
+ Ok((Cow::Owned(result), last_char))
}
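
// Sketch (assuming the crate's public API): only a doubled quote inside a quoted
// identifier forces the owned, unescaped path; plain contents stay borrowed.
use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn quoted_ident() {
    let dialect = GenericDialect {};
    let tokens = Tokenizer::new(&dialect, r#"SELECT "a""b""#).tokenize().unwrap();
    assert!(tokens.iter().any(|tok| matches!(
        tok,
        Token::Word(w) if w.value == r#"a"b"# && w.quote_style == Some('"')
    )));
}
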
#[allow(clippy::unnecessary_wraps)]
@@ -2304,7 +2449,78 @@
}
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
- Unescape::new(chars).unescape()
+ borrow_or_unescape_single_quoted_string(chars, true).map(|cow| cow.into_owned())
+}
+
+/// Scans a single-quoted string and returns either a borrowed slice or an unescaped owned string.
+///
+/// Strategy: scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or `unescape` is false), return [`Cow::Borrowed`]
+/// - If escapes exist and `unescape` is true, reprocess using the existing [`Unescape`] logic
+fn borrow_or_unescape_single_quoted_string<'a>(
+ chars: &mut State<'a>,
+ unescape: bool,
+) -> Option<Cow<'a, str>> {
+ let content_start = chars.byte_pos;
+
+ // Validate we're at an opening quote before consuming
+ if chars.peek() != Some(&'\'') {
+ return None;
+ }
+ chars.next(); // consume opening '
+
+ // Scan to find end and check for escape sequences
+ let mut has_escapes = false;
+
+ loop {
+ match chars.next() {
+ Some('\'') => {
+ // Check for doubled single quote (escape)
+ if chars.peek() == Some(&'\'') {
+ has_escapes = true;
+ chars.next(); // consume the second '
+ } else {
+ // End of string found (including closing ')
+ let content_end = chars.byte_pos;
+ let full_content = &chars.source[content_start..content_end];
+
+ // If no unescaping needed, return borrowed (without quotes)
+ if !unescape || !has_escapes {
+ // Strip the opening and closing quotes; full_content should contain
+ // both (at least 2 bytes), but check defensively before slicing.
+ if full_content.len() < 2 {
+ return None;
+ }
+ return Some(Cow::Borrowed(&full_content[1..full_content.len() - 1]));
+ }
+
+ // Need to unescape - reprocess using existing logic
+ // Create a temporary State from the content
+ let mut temp_state = State {
+ peekable: full_content.chars().peekable(),
+ source: full_content,
+ line: 0,
+ col: 0,
+ byte_pos: 0,
+ };
+
+ return Unescape::new(&mut temp_state).unescape().map(Cow::Owned);
+ }
+ }
+ Some('\\') => {
+ has_escapes = true;
+ // Skip next character (it's escaped)
+ chars.next();
+ }
+ Some(_) => {
+ // Regular character, continue scanning
+ }
+ None => {
+ // Unexpected EOF
+ return None;
+ }
+ }
+ }
}
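
// The borrow-unless-escaped idea in isolation (hypothetical, simplified to
// doubled-quote escapes only; the real function above also handles backslashes).
use std::borrow::Cow;

fn unquote(src: &str) -> Option<Cow<'_, str>> {
    let inner = src.strip_prefix('\'')?.strip_suffix('\'')?;
    if inner.contains("''") {
        Some(Cow::Owned(inner.replace("''", "'"))) // escapes present: allocate
    } else {
        Some(Cow::Borrowed(inner)) // fast path: zero-copy
    }
}

fn unquote_demo() {
    assert!(matches!(unquote("'abc'"), Some(Cow::Borrowed("abc"))));
    assert_eq!(unquote("'it''s'").unwrap(), "it's");
}
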
struct Unescape<'a: 'b, 'b> {
@@ -2452,8 +2668,98 @@
}
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
+ borrow_or_unescape_unicode_single_quoted_string(chars, true).map(|cow| cow.into_owned())
+}
+
+/// Scans a unicode-escaped single-quoted string and returns either a borrowed slice or an unescaped owned string.
+///
+/// Strategy: scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or `unescape` is false), return [`Cow::Borrowed`]
+/// - If escapes exist and `unescape` is true, reprocess with the unicode escaping logic
+fn borrow_or_unescape_unicode_single_quoted_string<'a>(
+ chars: &mut State<'a>,
+ unescape: bool,
+) -> Result<Cow<'a, str>, TokenizerError> {
+ let content_start = chars.byte_pos;
+ let error_loc = chars.location();
+
+ // Validate we're at an opening quote before consuming
+ if chars.peek() != Some(&'\'') {
+ return Err(TokenizerError {
+ message: "Expected opening quote for unicode string literal".to_string(),
+ location: error_loc,
+ });
+ }
+ chars.next(); // consume the opening quote
+
+ // Scan to find end and check for escape sequences
+ let mut has_escapes = false;
+
+ loop {
+ match chars.next() {
+ Some('\'') => {
+ // Check for doubled single quote (escape)
+ if chars.peek() == Some(&'\'') {
+ has_escapes = true;
+ chars.next(); // consume the second '
+ } else {
+ // End of string found (including closing ')
+ let content_end = chars.byte_pos;
+ let full_content = &chars.source[content_start..content_end];
+
+ // If no unescaping needed, return borrowed (without quotes)
+ if !unescape || !has_escapes {
+ // Strip the opening and closing quotes; full_content should contain
+ // both (at least 2 bytes), but check defensively before slicing.
+ if full_content.len() < 2 {
+ return Err(TokenizerError {
+ message: "Invalid unicode string literal".to_string(),
+ location: error_loc,
+ });
+ }
+ return Ok(Cow::Borrowed(&full_content[1..full_content.len() - 1]));
+ }
+
+ // Need to unescape - reprocess with unicode logic
+ // Create a temporary State from the content
+ let mut temp_state = State {
+ peekable: full_content.chars().peekable(),
+ source: full_content,
+ line: 0,
+ col: 0,
+ byte_pos: 0,
+ };
+
+ return process_unicode_string_with_escapes(&mut temp_state, error_loc)
+ .map(Cow::Owned);
+ }
+ }
+ Some('\\') => {
+ has_escapes = true;
+ // Skip next character (it's escaped or part of unicode sequence)
+ chars.next();
+ }
+ Some(_) => {
+ // Regular character, continue scanning
+ }
+ None => {
+ return Err(TokenizerError {
+ message: "Unterminated unicode encoded string literal".to_string(),
+ location: error_loc,
+ });
+ }
+ }
+ }
+}
+
+/// Process a unicode-escaped string using the original unescape logic
+fn process_unicode_string_with_escapes(
+ chars: &mut State<'_>,
+ error_loc: Location,
+) -> Result<String, TokenizerError> {
let mut unescaped = String::new();
chars.next(); // consume the opening quote
+
while let Some(c) = chars.next() {
match c {
'\'' => {
@@ -2480,9 +2786,10 @@
}
}
}
+
Err(TokenizerError {
message: "Unterminated unicode encoded string literal".to_string(),
- location: chars.location(),
+ location: error_loc,
})
}
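
// End-to-end sketch (assuming the crate's public API and the
// Token::UnicodeStringLiteral variant): a U&'...' literal without backslash
// escapes can now be borrowed; escaped ones still allocate.
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn unicode_literal() {
    let dialect = PostgreSqlDialect {};
    let tokens = Tokenizer::new(&dialect, r"U&'\0441\0442'").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::UnicodeStringLiteral(s) if s == "ст"));
}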