# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import sqlglot
from superset.sql.dialects.pinot import Pinot
def test_pinot_dialect_registered() -> None:
    """
    Verify that the custom Pinot dialect is registered under the "pinot" key.
    """
    from superset.sql.parse import SQLGLOT_DIALECTS

    # A missing key makes .get() return None, which fails the comparison,
    # so this covers both membership and identity of the registered class.
    assert SQLGLOT_DIALECTS.get("pinot") == Pinot
def test_double_quotes_as_identifiers() -> None:
    """
    Double-quoted tokens must parse as identifiers, not string literals.
    """
    ast = sqlglot.parse_one('SELECT "column_name" FROM "table_name"', Pinot)
    generated = Pinot().generate(expression=ast, pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            '  "column_name"',
            'FROM "table_name"',
        ]
    )
    assert generated == expected
def test_single_quotes_for_strings() -> None:
    """
    Single-quoted tokens must be treated as string literals.
    """
    ast = sqlglot.parse_one("SELECT * FROM users WHERE name = 'John'", Pinot)
    generated = Pinot().generate(expression=ast, pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            "  *",
            "FROM users",
            "WHERE",
            "  name = 'John'",
        ]
    )
    assert generated == expected
def test_backticks_as_identifiers() -> None:
    """
    MySQL-style backtick identifiers parse correctly and are emitted with
    double quotes by the Pinot generator.
    """
    ast = sqlglot.parse_one("SELECT `column_name` FROM `table_name`", Pinot)
    generated = Pinot().generate(expression=ast, pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            '  "column_name"',
            'FROM "table_name"',
        ]
    )
    assert generated == expected
def test_mixed_identifier_quotes() -> None:
    """
    A query mixing double-quoted and backticked identifiers is normalized so
    every identifier is emitted with double quotes.
    """
    sql = (
        'SELECT "col1", `col2` FROM "table1" JOIN `table2` ON "table1".id = `table2`.id'
    )
    generated = Pinot().generate(expression=sqlglot.parse_one(sql, Pinot), pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            '  "col1",',
            '  "col2"',
            'FROM "table1"',
            'JOIN "table2"',
            '  ON "table1".id = "table2".id',
        ]
    )
    assert generated == expected
def test_string_with_escaped_quotes() -> None:
    """
    Doubled single quotes inside a string literal survive the round trip.
    """
    ast = sqlglot.parse_one("SELECT * FROM users WHERE name = 'O''Brien'", Pinot)
    generated = Pinot().generate(expression=ast, pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            "  *",
            "FROM users",
            "WHERE",
            "  name = 'O''Brien'",
        ]
    )
    assert generated == expected
def test_string_with_backslash_escape() -> None:
    """
    A string literal containing backslashes parses and regenerates cleanly.
    """
    sql = r"SELECT * FROM users WHERE path = 'C:\\Users\\John'"
    rendered = Pinot().generate(
        expression=sqlglot.parse_one(sql, Pinot),
        pretty=True,
    )
    # Only spot-check structure; the exact escaping policy is not pinned here.
    for fragment in ("WHERE", "path"):
        assert fragment in rendered
@pytest.mark.parametrize(
    "sql, expected",
    [
        (
            'SELECT COUNT(*) FROM "events" WHERE "type" = \'click\'',
            "\n".join(
                [
                    "SELECT",
                    "  COUNT(*)",
                    'FROM "events"',
                    "WHERE",
                    '  "type" = \'click\'',
                ]
            ),
        ),
        (
            'SELECT "user_id", SUM("amount") FROM "transactions" GROUP BY "user_id"',
            "\n".join(
                [
                    "SELECT",
                    '  "user_id",',
                    '  SUM("amount")',
                    'FROM "transactions"',
                    "GROUP BY",
                    '  "user_id"',
                ]
            ),
        ),
        (
            "SELECT * FROM \"orders\" WHERE \"status\" IN ('pending', 'shipped')",
            "\n".join(
                [
                    "SELECT",
                    "  *",
                    'FROM "orders"',
                    "WHERE",
                    '  "status" IN (\'pending\', \'shipped\')',
                ]
            ),
        ),
    ],
)
def test_various_queries(sql: str, expected: str) -> None:
    """
    Round-trip a handful of representative queries through the Pinot dialect.
    """
    generated = Pinot().generate(expression=sqlglot.parse_one(sql, Pinot), pretty=True)
    assert generated == expected
def test_aggregate_functions() -> None:
    """
    Aggregate calls over double-quoted identifiers are preserved.
    """
    sql = """
SELECT
  "category",
  COUNT(*),
  AVG("price"),
  MAX("quantity")
FROM "products"
GROUP BY "category"
"""
    generated = Pinot().generate(expression=sqlglot.parse_one(sql, Pinot), pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            '  "category",',
            "  COUNT(*),",
            '  AVG("price"),',
            '  MAX("quantity")',
            'FROM "products"',
            "GROUP BY",
            '  "category"',
        ]
    )
    assert generated == expected
def test_join_with_quoted_identifiers() -> None:
    """
    JOIN clauses keep their double-quoted table and column references.
    """
    sql = """
SELECT "u"."name", "o"."total"
FROM "users" AS "u"
JOIN "orders" AS "o" ON "u"."id" = "o"."user_id"
"""
    generated = Pinot().generate(expression=sqlglot.parse_one(sql, Pinot), pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            '  "u"."name",',
            '  "o"."total"',
            'FROM "users" AS "u"',
            'JOIN "orders" AS "o"',
            '  ON "u"."id" = "o"."user_id"',
        ]
    )
    assert generated == expected
def test_subquery_with_quoted_identifiers() -> None:
    """
    A derived table with a quoted alias round-trips correctly.
    """
    sql = 'SELECT * FROM (SELECT "id", "name" FROM "users") AS "subquery"'
    generated = Pinot().generate(expression=sqlglot.parse_one(sql, Pinot), pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            "  *",
            "FROM (",
            "  SELECT",
            '    "id",',
            '    "name"',
            '  FROM "users"',
            ') AS "subquery"',
        ]
    )
    assert generated == expected
def test_case_expression() -> None:
    """
    CASE expressions keep quoted identifiers and single-quoted literals.
    """
    sql = """
SELECT "name",
       CASE WHEN "age" < 18 THEN 'minor'
            WHEN "age" >= 18 THEN 'adult'
       END AS "category"
FROM "persons"
"""
    rendered = Pinot().generate(
        expression=sqlglot.parse_one(sql, Pinot),
        pretty=True,
    )
    # Identifiers stay double-quoted; string literals stay single-quoted.
    for fragment in ('"name"', '"age"', '"category"', "'minor'", "'adult'"):
        assert fragment in rendered
def test_cte_with_quoted_identifiers() -> None:
    """
    Common table expressions keep their quoted names and references.
    """
    sql = """
WITH "high_value_orders" AS (
    SELECT * FROM "orders" WHERE "total" > 1000
)
SELECT "customer_id", COUNT(*) FROM "high_value_orders" GROUP BY "customer_id"
"""
    rendered = Pinot().generate(
        expression=sqlglot.parse_one(sql, Pinot),
        pretty=True,
    )
    for fragment in (
        'WITH "high_value_orders" AS',
        '"orders"',
        '"total"',
        '"customer_id"',
    ):
        assert fragment in rendered
def test_order_by_with_quoted_identifiers() -> None:
    """
    ORDER BY keeps quoted identifiers; sqlglot spells out ASC explicitly.
    """
    sql = 'SELECT "name", "salary" FROM "employees" ORDER BY "salary" DESC, "name" ASC'
    generated = Pinot().generate(expression=sqlglot.parse_one(sql, Pinot), pretty=True)
    expected = "\n".join(
        [
            "SELECT",
            '  "name",',
            '  "salary"',
            'FROM "employees"',
            "ORDER BY",
            '  "salary" DESC,',
            '  "name" ASC',
        ]
    )
    assert generated == expected
def test_limit_and_offset() -> None:
    """
    LIMIT/OFFSET clauses survive the parse/generate round trip.
    """
    ast = sqlglot.parse_one('SELECT * FROM "products" LIMIT 10 OFFSET 20', Pinot)
    rendered = Pinot().generate(expression=ast, pretty=True)
    for fragment in ('"products"', "LIMIT 10"):
        assert fragment in rendered
def test_distinct() -> None:
    """
    SELECT DISTINCT is preserved alongside quoted identifiers.
    """
    ast = sqlglot.parse_one('SELECT DISTINCT "category" FROM "products"', Pinot)
    generated = Pinot().generate(expression=ast, pretty=True)
    expected = "\n".join(
        [
            "SELECT DISTINCT",
            '  "category"',
            'FROM "products"',
        ]
    )
    assert generated == expected
def test_cast_to_string() -> None:
    """
    CAST(... AS STRING) must keep the STRING type (no CHAR conversion).
    """
    ast = sqlglot.parse_one("SELECT CAST(cohort_size AS STRING) FROM table", Pinot)
    rendered = Pinot().generate(expression=ast)
    assert "STRING" in rendered
    # MySQL-style generators would rewrite this to CHAR; Pinot must not.
    assert "CHAR" not in rendered
def test_concat_with_cast_string() -> None:
    """
    CONCAT with CAST to STRING - verifies the original issue is fixed.

    The STRING type must be preserved (not converted to MySQL's CHAR) when a
    cast appears inside a function call.
    """
    sql = """
SELECT concat(a, cast(b AS string), ' - ')
FROM "default".c"""
    ast = sqlglot.parse_one(sql, Pinot)
    generated = Pinot().generate(expression=ast)
    # Verify STRING type is preserved (not converted to CHAR).
    # NOTE: the case-insensitive check subsumes the old
    # `"STRING" in generated or ...` form, whose first disjunct was redundant.
    assert "string" in generated.lower()
    assert "CHAR" not in generated
@pytest.mark.parametrize(
    "cast_type, expected_type",
    [
        ("INT", "INT"),
        ("TINYINT", "INT"),
        ("SMALLINT", "INT"),
        ("BIGINT", "LONG"),
        ("LONG", "LONG"),
        ("FLOAT", "FLOAT"),
        ("DOUBLE", "DOUBLE"),
        ("BOOLEAN", "BOOLEAN"),
        ("TIMESTAMP", "TIMESTAMP"),
        ("STRING", "STRING"),
        ("VARCHAR", "STRING"),
        ("CHAR", "STRING"),
        ("TEXT", "STRING"),
        ("BYTES", "BYTES"),
        ("BINARY", "BYTES"),
        ("VARBINARY", "BYTES"),
        ("JSON", "JSON"),
    ],
)
def test_type_mappings(cast_type: str, expected_type: str) -> None:
    """
    Each input type in a CAST maps onto the expected Pinot type.
    """
    query = f"SELECT CAST(col AS {cast_type}) FROM table"  # noqa: S608
    rendered = Pinot().generate(expression=sqlglot.parse_one(query, Pinot))
    assert expected_type in rendered
def test_unsigned_type() -> None:
    """
    Unsigned integer types go through the UNSIGNED_TYPE_MAPPING branch of
    the generator's datatype_sql method.
    """
    from sqlglot import exp

    # UBIGINT is one of the types covered by UNSIGNED_TYPE_MAPPING.
    unsigned = exp.DataType(this=exp.DataType.Type.UBIGINT)
    rendered = Pinot.Generator().datatype_sql(unsigned)
    for token in ("UNSIGNED", "BIGINT"):
        assert token in rendered
def test_date_trunc_preserved() -> None:
    """
    DATE_TRUNC must survive a round trip instead of collapsing to MySQL's
    DATE() function.
    """
    query = "SELECT DATE_TRUNC('day', dt_column) FROM table"
    roundtrip = sqlglot.parse_one(query, Pinot).sql(Pinot)
    assert "DATE_TRUNC" in roundtrip
    assert "date_trunc('day'" in roundtrip.lower()
    # Guard against the MySQL rewrite to DATE(dt_column).
    assert roundtrip != "SELECT DATE(dt_column) FROM table"
def test_cast_timestamp_preserved() -> None:
    """
    CAST(... AS TIMESTAMP) must stay a cast and not become a TIMESTAMP()
    function call (the MySQL rewrite).
    """
    query = "SELECT CAST(dt_column AS TIMESTAMP) FROM table"
    roundtrip = sqlglot.parse_one(query, Pinot).sql(Pinot)
    for fragment in ("CAST", "AS TIMESTAMP"):
        assert fragment in roundtrip
    # Guard against the MySQL-style TIMESTAMP(dt_column) rewrite.
    assert "TIMESTAMP(dt_column)" not in roundtrip
def test_date_trunc_with_cast_timestamp() -> None:
    """
    The original complex report query mixing DATE_TRUNC, nested CASTs, and
    DATETIMECONVERT round-trips with both constructs intact.
    """
    query = """
SELECT
  CAST(
    DATE_TRUNC(
      'day',
      CAST(
        DATETIMECONVERT(
          dt_epoch_ms, '1:MILLISECONDS:EPOCH',
          '1:MILLISECONDS:EPOCH', '1:MILLISECONDS'
        ) AS TIMESTAMP
      )
    ) AS TIMESTAMP
  ),
  SUM(a) + SUM(b)
FROM
  "default".c
WHERE
  dt_epoch_ms >= 1735690800000
  AND dt_epoch_ms < 1759328588000
  AND locality != 'US'
GROUP BY
  CAST(
    DATE_TRUNC(
      'day',
      CAST(
        DATETIMECONVERT(
          dt_epoch_ms, '1:MILLISECONDS:EPOCH',
          '1:MILLISECONDS:EPOCH', '1:MILLISECONDS'
        ) AS TIMESTAMP
      )
    ) AS TIMESTAMP
  )
LIMIT
  10000
"""
    roundtrip = sqlglot.parse_one(query, Pinot).sql(Pinot)
    # Both DATE_TRUNC and CAST must survive the round trip.
    assert "DATE_TRUNC" in roundtrip
    assert "CAST" in roundtrip
    # Neither may be rewritten into MySQL function forms.
    assert "TIMESTAMP(DATETIMECONVERT" not in roundtrip
    # DATE_TRUNC appears once in SELECT and once in GROUP BY.
    assert roundtrip.count("DATE_TRUNC") == 2
def test_pinot_date_add_parsing() -> None:
    """
    Pinot's Presto-style DATE_ADD('unit', n, ts) syntax parses through the
    SQLScript entry point.
    """
    from superset.sql.parse import SQLScript

    query = """
SELECT dt_epoch_ms FROM my_table WHERE dt_epoch_ms >= date_add('day', -180, now())
"""
    script = SQLScript(query, "pinot")
    assert len(script.statements) == 1
    assert not script.has_mutation()
def test_pinot_date_add_simple() -> None:
    """
    Simple DATE_ADD expressions parse and regenerate as DATE_ADD calls.
    """
    expressions = (
        "date_add('day', -180, now())",
        "DATE_ADD('month', 5, current_timestamp())",
        "date_add('year', 1, my_date_column)",
    )
    for expression in expressions:
        tree = sqlglot.parse_one(expression, Pinot)
        assert tree is not None
        # The regenerated SQL must still be a DATE_ADD call.
        assert "DATE_ADD" in tree.sql(dialect=Pinot).upper()
def test_pinot_date_add_unit_quoted() -> None:
    """
    The DATE_ADD unit argument must be emitted as a quoted string, since
    Pinot rejects a bare identifier there.
    """
    expression = "dt_epoch_ms >= date_add('day', -180, now())"
    roundtrip = sqlglot.parse_one(expression, Pinot).sql(Pinot)
    # Quoted 'DAY' is required; a bare DAY identifier is a regression.
    assert "DATE_ADD('DAY', -180, NOW())" in roundtrip
    assert "DATE_ADD(DAY," not in roundtrip
def test_pinot_date_sub_parsing() -> None:
    """
    Pinot's Presto-style DATE_SUB('unit', n, ts) syntax parses through the
    SQLScript entry point.
    """
    from superset.sql.parse import SQLScript

    script = SQLScript(
        "SELECT * FROM my_table WHERE dt >= date_sub('day', 7, now())",
        "pinot",
    )
    assert len(script.statements) == 1
    assert not script.has_mutation()
def test_pinot_date_sub_simple() -> None:
    """
    Simple DATE_SUB expressions parse and regenerate as DATE_SUB calls.
    """
    expressions = (
        "date_sub('day', 7, now())",
        "DATE_SUB('month', 3, current_timestamp())",
        "date_sub('hour', 24, my_date_column)",
    )
    for expression in expressions:
        tree = sqlglot.parse_one(expression, Pinot)
        assert tree is not None
        # The regenerated SQL must still be a DATE_SUB call.
        assert "DATE_SUB" in tree.sql(dialect=Pinot).upper()
def test_pinot_date_sub_unit_quoted() -> None:
    """
    The DATE_SUB unit argument must be emitted as a quoted string, since
    Pinot rejects a bare identifier there.
    """
    expression = "dt_epoch_ms >= date_sub('day', -180, now())"
    roundtrip = sqlglot.parse_one(expression, Pinot).sql(Pinot)
    # Quoted 'DAY' is required; a bare DAY identifier is a regression.
    assert "DATE_SUB('DAY', -180, NOW())" in roundtrip
    assert "DATE_SUB(DAY," not in roundtrip
def test_substr_cross_dialect_generation() -> None:
    """
    SUBSTR stays SUBSTR when generating Pinot SQL, while the MySQL dialect
    (which Pinot is based on) renders it as SUBSTRING.
    """
    parsed = sqlglot.parse_one("SELECT SUBSTR('hello', 0, 3) FROM users", Pinot)

    # Pinot output keeps the original SUBSTR spelling.
    pinot_output = parsed.sql(dialect=Pinot)
    assert "SUBSTR(" in pinot_output
    assert "SUBSTRING(" not in pinot_output

    # MySQL output converts the call to SUBSTRING.
    mysql_output = parsed.sql(dialect="mysql")
    assert "SUBSTRING(" in mysql_output

    # The two dialects therefore disagree on this query.
    assert pinot_output != mysql_output
@pytest.mark.parametrize(
    "function_name,sample_args",
    [
        # Math functions
        ("ABS", "-5"),
        ("CEIL", "3.14"),
        ("FLOOR", "3.14"),
        ("EXP", "2"),
        ("LN", "10"),
        ("SQRT", "16"),
        ("ROUNDDECIMAL", "3.14159, 2"),
        ("ADD", "1, 2, 3"),
        ("SUB", "10, 3"),
        ("MULT", "5, 4"),
        ("MOD", "10, 3"),
        # String functions
        ("UPPER", "'hello'"),
        ("LOWER", "'HELLO'"),
        ("REVERSE", "'hello'"),
        ("SUBSTR", "'hello', 0, 3"),
        ("CONCAT", "'hello', ' ', 'world'"),
        ("TRIM", "' hello '"),
        ("LTRIM", "' hello'"),
        ("RTRIM", "'hello '"),
        ("LENGTH", "'hello'"),
        ("STRPOS", "'hello', 'l', 1"),
        ("STARTSWITH", "'hello', 'he'"),
        ("REPLACE", "'hello', 'l', 'r'"),
        ("RPAD", "'hello', 10, 'x'"),
        ("LPAD", "'hello', 10, 'x'"),
        ("CODEPOINT", "'A'"),
        ("CHR", "65"),
        ("regexpExtract", "'foo123bar', '[0-9]+'"),
        ("regexpReplace", "'hello', 'l', 'r'"),
        ("remove", "'hello', 'l'"),
        ("urlEncoding", "'hello world'"),
        ("urlDecoding", "'hello%20world'"),
        ("fromBase64", "'aGVsbG8='"),
        ("toUtf8", "'hello'"),
        ("isSubnetOf", "'192.168.1.1', '192.168.0.0/16'"),
        # DateTime functions
        ("DATETRUNC", "'day', timestamp_col"),
        ("DATETIMECONVERT", "dt_col, '1:HOURS:EPOCH', '1:DAYS:EPOCH', '1:DAYS'"),
        ("TIMECONVERT", "timestamp_col, 'MILLISECONDS', 'SECONDS'"),
        ("NOW", ""),
        ("AGO", "'P1D'"),
        ("YEAR", "timestamp_col"),
        ("QUARTER", "timestamp_col"),
        ("MONTH", "timestamp_col"),
        ("WEEK", "timestamp_col"),
        ("DAY", "timestamp_col"),
        ("HOUR", "timestamp_col"),
        ("MINUTE", "timestamp_col"),
        ("SECOND", "timestamp_col"),
        ("MILLISECOND", "timestamp_col"),
        ("DAYOFWEEK", "timestamp_col"),
        ("DAYOFYEAR", "timestamp_col"),
        ("YEAROFWEEK", "timestamp_col"),
        ("toEpochSeconds", "timestamp_col"),
        ("toEpochMinutes", "timestamp_col"),
        ("toEpochHours", "timestamp_col"),
        ("toEpochDays", "timestamp_col"),
        ("fromEpochSeconds", "1234567890"),
        ("fromEpochMinutes", "20576131"),
        ("fromEpochHours", "342935"),
        ("fromEpochDays", "14288"),
        ("toDateTime", "timestamp_col, 'yyyy-MM-dd'"),
        ("fromDateTime", "'2024-01-01', 'yyyy-MM-dd'"),
        ("timezoneHour", "timestamp_col"),
        ("timezoneMinute", "timestamp_col"),
        ("DATE_ADD", "'day', 7, NOW()"),
        ("DATE_SUB", "'day', 7, NOW()"),
        ("TIMESTAMPADD", "'day', 7, timestamp_col"),
        ("TIMESTAMPDIFF", "'day', timestamp1, timestamp2"),
        ("dateTrunc", "'day', timestamp_col"),
        ("dateDiff", "'day', timestamp1, timestamp2"),
        ("dateAdd", "'day', 7, timestamp_col"),
        ("dateBin", "'day', timestamp_col, NOW()"),
        ("toIso8601", "timestamp_col"),
        ("fromIso8601", "'2024-01-01T00:00:00Z'"),
        # Aggregation functions
        ("COUNT", "*"),
        ("SUM", "amount"),
        ("AVG", "value"),
        ("MIN", "value"),
        ("MAX", "value"),
        ("DISTINCTCOUNT", "user_id"),
        ("DISTINCTCOUNTBITMAP", "user_id"),
        ("DISTINCTCOUNTHLL", "user_id"),
        ("DISTINCTCOUNTRAWHLL", "user_id"),
        ("DISTINCTCOUNTHLLPLUS", "user_id"),
        ("DISTINCTCOUNTRAWHLLPLUS", "user_id"),
        ("DISTINCTCOUNTSMARTHLL", "user_id"),
        ("DISTINCTCOUNTCPCSKETCH", "user_id"),
        ("DISTINCTCOUNTRAWCPCSKETCH", "user_id"),
        ("DISTINCTCOUNTTHETASKETCH", "user_id"),
        ("DISTINCTCOUNTRAWTHETASKETCH", "user_id"),
        ("DISTINCTCOUNTTUPLESKETCH", "user_id"),
        ("DISTINCTCOUNTRAWINTEGERSUMTUPLESKETCH", "user_id"),
        ("DISTINCTCOUNTULL", "user_id"),
        ("DISTINCTCOUNTRAWULL", "user_id"),
        ("SEGMENTPARTITIONEDDISTINCTCOUNT", "user_id"),
        ("SUMVALUESINTEGERSUMTUPLESKETCH", "value"),
        ("PERCENTILE", "value, 95"),
        ("PERCENTILEEST", "value, 95"),
        ("PERCENTILETDIGEST", "value, 95"),
        ("PERCENTILESMARTTDIGEST", "value, 95"),
        ("PERCENTILEKLL", "value, 95"),
        ("PERCENTILEKLLRAW", "value, 95"),
        ("HISTOGRAM", "value, 10"),
        ("MODE", "category"),
        ("MINMAXRANGE", "value"),
        ("SUMPRECISION", "value, 10"),
        ("ARG_MIN", "value, id"),
        ("ARG_MAX", "value, id"),
        ("COVAR_POP", "x, y"),
        ("COVAR_SAMP", "x, y"),
        ("LASTWITHTIME", "value, timestamp_col, 'LONG'"),
        ("FIRSTWITHTIME", "value, timestamp_col, 'LONG'"),
        ("ARRAY_AGG", "value"),
        # Multi-value functions
        ("COUNTMV", "tags"),
        ("MAXMV", "scores"),
        ("MINMV", "scores"),
        ("SUMMV", "scores"),
        ("AVGMV", "scores"),
        ("MINMAXRANGEMV", "scores"),
        ("PERCENTILEMV", "scores, 95"),
        ("PERCENTILEESTMV", "scores, 95"),
        ("PERCENTILETDIGESTMV", "scores, 95"),
        ("PERCENTILEKLLMV", "scores, 95"),
        ("DISTINCTCOUNTMV", "tags"),
        ("DISTINCTCOUNTBITMAPMV", "tags"),
        ("DISTINCTCOUNTHLLMV", "tags"),
        ("DISTINCTCOUNTRAWHLLMV", "tags"),
        ("DISTINCTCOUNTHLLPLUSMV", "tags"),
        ("DISTINCTCOUNTRAWHLLPLUSMV", "tags"),
        ("ARRAYLENGTH", "array_col"),
        ("MAP_VALUE", "map_col, 'key'"),
        ("VALUEIN", "value, 'val1', 'val2'"),
        # JSON functions
        ("JSONEXTRACTSCALAR", "json_col, '$.name', 'STRING'"),
        ("JSONEXTRACTKEY", "json_col, '$.data'"),
        ("JSONFORMAT", "json_col"),
        ("JSONPATH", "json_col, '$.name'"),
        ("JSONPATHLONG", "json_col, '$.id'"),
        ("JSONPATHDOUBLE", "json_col, '$.price'"),
        ("JSONPATHSTRING", "json_col, '$.name'"),
        ("JSONPATHARRAY", "json_col, '$.items'"),
        ("JSONPATHARRAYDEFAULTEMPTY", "json_col, '$.items'"),
        ("TOJSONMAPSTR", "map_col"),
        ("JSON_MATCH", "json_col, '\"$.name\"=''value'''"),
        ("JSON_EXTRACT_SCALAR", "json_col, '$.name', 'STRING'"),
        # Array functions
        ("arrayReverseInt", "int_array"),
        ("arrayReverseString", "string_array"),
        ("arraySortInt", "int_array"),
        ("arraySortString", "string_array"),
        ("arrayIndexOfInt", "int_array, 5"),
        ("arrayIndexOfString", "string_array, 'value'"),
        ("arrayContainsInt", "int_array, 5"),
        ("arrayContainsString", "string_array, 'value'"),
        ("arraySliceInt", "int_array, 0, 3"),
        ("arraySliceString", "string_array, 0, 3"),
        ("arrayDistinctInt", "int_array"),
        ("arrayDistinctString", "string_array"),
        ("arrayRemoveInt", "int_array, 5"),
        ("arrayRemoveString", "string_array, 'value'"),
        ("arrayUnionInt", "int_array1, int_array2"),
        ("arrayUnionString", "string_array1, string_array2"),
        ("arrayConcatInt", "int_array1, int_array2"),
        ("arrayConcatString", "string_array1, string_array2"),
        ("arrayElementAtInt", "int_array, 0"),
        ("arrayElementAtString", "string_array, 0"),
        ("arraySumInt", "int_array"),
        ("arrayValueConstructor", "1, 2, 3"),
        ("arrayToString", "array_col, ','"),
        # Geospatial functions
        ("ST_DISTANCE", "point1, point2"),
        ("ST_CONTAINS", "polygon, point"),
        ("ST_AREA", "polygon"),
        ("ST_GEOMFROMTEXT", "'POINT(1 2)'"),
        ("ST_GEOMFROMWKB", "wkb_col"),
        ("ST_GEOGFROMWKB", "wkb_col"),
        ("ST_GEOGFROMTEXT", "'POINT(1 2)'"),
        ("ST_POINT", "1.0, 2.0"),
        ("ST_POLYGON", "'POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'"),
        ("ST_ASBINARY", "geom_col"),
        ("ST_ASTEXT", "geom_col"),
        ("ST_GEOMETRYTYPE", "geom_col"),
        ("ST_EQUALS", "geom1, geom2"),
        ("ST_WITHIN", "geom1, geom2"),
        ("ST_UNION", "geom1, geom2"),
        ("ST_GEOMFROMGEOJSON", '\'{"type":"Point","coordinates":[1,2]}\''),
        ("ST_GEOGFROMGEOJSON", '\'{"type":"Point","coordinates":[1,2]}\''),
        ("ST_ASGEOJSON", "geom_col"),
        ("toSphericalGeography", "geom_col"),
        ("toGeometry", "geog_col"),
        # Binary/Hash functions
        ("SHA", "'hello'"),
        ("SHA256", "'hello'"),
        ("SHA512", "'hello'"),
        ("SHA224", "'hello'"),
        ("MD5", "'hello'"),
        ("MD2", "'hello'"),
        ("toBase64", "'hello'"),
        ("fromUtf8", "bytes_col"),
        ("MurmurHash2", "'hello'"),
        ("MurmurHash3Bit32", "'hello'"),
        # Window functions
        ("ROW_NUMBER", ""),
        ("RANK", ""),
        ("DENSE_RANK", ""),
        # Funnel analysis
        ("FunnelMaxStep", "event_col, 'step1', 'step2', 'step3'"),
        ("FunnelMatchStep", "event_col, 'step1', 'step2', 'step3'"),
        ("FunnelCompleteCount", "event_col, 'step1', 'step2', 'step3'"),
        # Text search
        ("TEXT_MATCH", "text_col, 'search query'"),
        # Vector functions
        ("VECTOR_SIMILARITY", "vector1, vector2"),
        ("l2_distance", "vector1, vector2"),
        # Lookup
        ("LOOKUP", "'lookupTable', 'lookupColumn', 'keyColumn', keyValue"),
        # URL functions
        ("urlProtocol", "'https://example.com/path'"),
        ("urlDomain", "'https://example.com/path'"),
        ("urlPath", "'https://example.com/path'"),
        ("urlPort", "'https://example.com:8080/path'"),
        ("urlEncode", "'hello world'"),
        ("urlDecode", "'hello%20world'"),
        # Conditional
        ("COALESCE", "val1, val2, 'default'"),
        ("NULLIF", "val1, val2"),
        ("GREATEST", "1, 2, 3"),
        ("LEAST", "1, 2, 3"),
        # Other
        ("REGEXP_LIKE", "'hello', 'h.*'"),
        ("GROOVY", "'{return arg0 + arg1}', col1, col2"),
    ],
)
def test_pinot_function_names_preserved(function_name: str, sample_args: str) -> None:
    """
    Parse/generate round trips must leave Pinot function names untouched.

    If a name were silently rewritten (e.g. to a MySQL equivalent), the query
    would no longer match Pinot's function library, so each sampled call is
    checked case-insensitively in the regenerated SQL.
    """
    # Window functions need an OVER clause to be syntactically valid.
    if function_name in ["ROW_NUMBER", "RANK", "DENSE_RANK"]:
        sql = f"SELECT {function_name}() OVER (ORDER BY col) FROM table"  # noqa: S608
    else:
        sql = f"SELECT {function_name}({sample_args}) FROM table"  # noqa: S608

    regenerated = sqlglot.parse_one(sql, Pinot).sql(dialect=Pinot)
    assert function_name.upper() in regenerated.upper(), (
        f"Function {function_name} not preserved in output: {regenerated}"
    )