feat: Impala dialect for sqlglot
diff --git a/superset/sql/dialects/__init__.py b/superset/sql/dialects/__init__.py
index 3b43b15..22a1716 100644
--- a/superset/sql/dialects/__init__.py
+++ b/superset/sql/dialects/__init__.py
@@ -17,5 +17,6 @@
from .dremio import Dremio
from .firebolt import Firebolt, FireboltOld
+from .impala import Impala
-__all__ = ["Dremio", "Firebolt", "FireboltOld"]
+__all__ = ["Dremio", "Firebolt", "FireboltOld", "Impala"]
diff --git a/superset/sql/dialects/impala.py b/superset/sql/dialects/impala.py
new file mode 100644
index 0000000..0c38ce6
--- /dev/null
+++ b/superset/sql/dialects/impala.py
@@ -0,0 +1,259 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import typing as t
+
+from sqlglot import exp, generator, parser
+from sqlglot.dialects.hive import Hive
+from sqlglot.helper import seq_get
+
+
+class Impala(Hive):
+ """
+ A sqlglot dialect for Impala.
+
+ Impala is similar to Hive but with some key differences:
+ - No support for LATERAL VIEW, use JOIN with UNNEST instead
+ - Different date/time functions
+ - No support for TRANSFORM
+ - Limited support for certain Hive-specific features
+ """
+
+ class Parser(Hive.Parser):
+ FUNCTIONS = {
+ **Hive.Parser.FUNCTIONS,
+ # Impala-specific functions
+ "MONTHS_ADD": lambda args: exp.DateAdd(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("MONTH"),
+ ),
+ "MONTHS_SUB": lambda args: exp.DateSub(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("MONTH"),
+ ),
+ "YEARS_ADD": lambda args: exp.DateAdd(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("YEAR"),
+ ),
+ "YEARS_SUB": lambda args: exp.DateSub(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("YEAR"),
+ ),
+ "DAYS_ADD": lambda args: exp.DateAdd(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("DAY"),
+ ),
+ "DAYS_SUB": lambda args: exp.DateSub(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("DAY"),
+ ),
+ "WEEKS_ADD": lambda args: exp.DateAdd(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("WEEK"),
+ ),
+ "WEEKS_SUB": lambda args: exp.DateSub(
+ this=seq_get(args, 0),
+ expression=seq_get(args, 1),
+ unit=exp.Literal.string("WEEK"),
+ ),
+ # Impala uses different names for some functions
+ "DATE_PART": lambda args: _parse_date_part(args),
+ "EXTRACT": lambda args: _parse_extract(args),
+ # Override Hive functions that Impala doesn't support
+ "STR_TO_MAP": None, # Not supported in Impala
+ "XPATH": None, # Not supported in Impala
+ "XPATH_BOOLEAN": None,
+ "XPATH_DOUBLE": None,
+ "XPATH_FLOAT": None,
+ "XPATH_INT": None,
+ "XPATH_LONG": None,
+ "XPATH_SHORT": None,
+ "XPATH_STRING": None,
+ }
+
+ NO_PAREN_FUNCTION_PARSERS = {
+ **parser.Parser.NO_PAREN_FUNCTION_PARSERS,
+ # Remove TRANSFORM as it's not supported in Impala
+ }
+ NO_PAREN_FUNCTION_PARSERS.pop("TRANSFORM", None)
+
+ def _parse_lateral(self) -> t.Optional[exp.Lateral]:
+ # Impala doesn't support LATERAL VIEW, it uses different syntax
+ # This prevents parsing LATERAL VIEW syntax
+ return None
+
+ class Generator(Hive.Generator):
+ # Impala-specific type mappings
+ TYPE_MAPPING = {
+ **Hive.Generator.TYPE_MAPPING,
+ exp.DataType.Type.VARCHAR: "STRING", # Impala treats VARCHAR as STRING
+ exp.DataType.Type.NVARCHAR: "STRING",
+ exp.DataType.Type.CHAR: "STRING", # Impala treats CHAR as STRING
+ exp.DataType.Type.NCHAR: "STRING",
+ }
+
+ TRANSFORMS = {
+ **Hive.Generator.TRANSFORMS,
+ # Date/time functions
+ exp.DateAdd: lambda self, e: _date_add_sql(self, e),
+ exp.DateSub: lambda self, e: _date_sub_sql(self, e),
+ # Impala doesn't support certain Hive features
+ exp.StrToMap: lambda self, e: self.unsupported(
+ "STR_TO_MAP is not supported in Impala"
+ ),
+ exp.Transform: lambda self, e: self.unsupported(
+ "TRANSFORM is not supported in Impala"
+ ),
+ exp.QueryTransform: lambda self, e: self.unsupported(
+ "TRANSFORM is not supported in Impala"
+ ),
+ # Override LATERAL VIEW handling
+ exp.Lateral: lambda self, e: _lateral_sql(self, e),
+ # JSON functions have different names in Impala
+ exp.JSONExtract: lambda self, e: self.func(
+ "JSON_QUERY", e.this, e.expression
+ ),
+ exp.JSONExtractScalar: lambda self, e: self.func(
+ "JSON_VALUE", e.this, e.expression
+ ),
+ # Impala uses different syntax for COLLECT_LIST/SET
+ exp.ArrayAgg: lambda self, e: self.func(
+ "GROUP_CONCAT",
+ e.this.this if isinstance(e.this, exp.Order) else e.this,
+ exp.Literal.string(","),
+ ),
+ exp.ArrayUniqueAgg: lambda self, e: self.func(
+ "GROUP_CONCAT",
+ self.sql(exp.Distinct(expressions=[e.this])),
+ exp.Literal.string(","),
+ ),
+ }
+
+ def datatype_sql(self, expression: exp.DataType) -> str:
+ # Impala treats CHAR/VARCHAR as STRING
+ if expression.is_type("char", "varchar", "nchar", "nvarchar"):
+ return "STRING"
+
+ return super().datatype_sql(expression)
+
+ def lateral_sql(self, expression: exp.Lateral) -> str:
+ # Impala doesn't use LATERAL VIEW syntax
+ # Instead, it uses regular JOIN with UNNEST
+ if isinstance(expression.this, exp.Unnest):
+ return self.sql(expression.this)
+ return super().lateral_sql(expression)
+
+
+def _parse_date_part(args: t.List[exp.Expression]) -> exp.Expression:
+ """Parse DATE_PART function which extracts date parts."""
+ part = seq_get(args, 0)
+ date = seq_get(args, 1)
+
+ if isinstance(part, exp.Literal):
+ part_name = part.name.upper()
+ if part_name == "YEAR":
+ return exp.Year(this=date)
+ elif part_name == "MONTH":
+ return exp.Month(this=date)
+ elif part_name == "DAY":
+ return exp.Day(this=date)
+ elif part_name == "HOUR":
+ return exp.Hour(this=date)
+ elif part_name == "MINUTE":
+ return exp.Minute(this=date)
+ elif part_name == "SECOND":
+ return exp.Second(this=date)
+
+ return exp.Extract(this=part, expression=date)
+
+
+def _parse_extract(args: t.List[exp.Expression]) -> exp.Expression:
+ """Parse EXTRACT function."""
+ return exp.Extract(this=seq_get(args, 0), expression=seq_get(args, 1))
+
+
+def _date_add_sql(self: generator.Generator, expression: exp.DateAdd) -> str:
+ """Generate SQL for date addition in Impala."""
+ unit = expression.text("unit").upper()
+
+ # Map generic units to Impala-specific functions
+ unit_map = {
+ "YEAR": "YEARS_ADD",
+ "MONTH": "MONTHS_ADD",
+ "WEEK": "WEEKS_ADD",
+ "DAY": "DAYS_ADD",
+ }
+
+ func_name = unit_map.get(unit, "DATE_ADD")
+
+ if func_name != "DATE_ADD":
+ return self.func(func_name, expression.this, expression.expression)
+
+ # For other units, use DATE_ADD with INTERVAL
+ return self.func(
+ "DATE_ADD",
+ expression.this,
+ self.sql(exp.Interval(this=expression.expression, unit=expression.unit)),
+ )
+
+
+def _date_sub_sql(self: generator.Generator, expression: exp.DateSub) -> str:
+ """Generate SQL for date subtraction in Impala."""
+ unit = expression.text("unit").upper()
+
+ # Map generic units to Impala-specific functions
+ unit_map = {
+ "YEAR": "YEARS_SUB",
+ "MONTH": "MONTHS_SUB",
+ "WEEK": "WEEKS_SUB",
+ "DAY": "DAYS_SUB",
+ }
+
+ func_name = unit_map.get(unit, "DATE_SUB")
+
+ if func_name != "DATE_SUB":
+ return self.func(func_name, expression.this, expression.expression)
+
+ # For other units, use DATE_SUB with INTERVAL
+ return self.func(
+ "DATE_SUB",
+ expression.this,
+ self.sql(exp.Interval(this=expression.expression, unit=expression.unit)),
+ )
+
+
+def _lateral_sql(self: generator.Generator, expression: exp.Lateral) -> str:
+ """Generate SQL for LATERAL expressions in Impala."""
+ # Impala doesn't support LATERAL VIEW syntax
+ # It uses regular JOIN with UNNEST instead
+ this = expression.this
+
+ if isinstance(this, exp.Unnest):
+ # Just return the UNNEST expression without LATERAL VIEW
+ return self.sql(this)
+
+ # For other cases, try to generate standard SQL
+ return self.sql(this)
diff --git a/superset/sql/parse.py b/superset/sql/parse.py
index 391bf0d..5295713 100644
--- a/superset/sql/parse.py
+++ b/superset/sql/parse.py
@@ -44,7 +44,7 @@
)
from superset.exceptions import QueryClauseValidationException, SupersetParseError
-from superset.sql.dialects import Dremio, Firebolt
+from superset.sql.dialects import Dremio, Firebolt, Impala
if TYPE_CHECKING:
from superset.models.core import Database
@@ -81,7 +81,7 @@
"hana": Dialects.POSTGRES,
"hive": Dialects.HIVE,
# "ibmi": ???
- # "impala": ???
+ "impala": Impala,
# "kustosql": ???
# "kylin": ???
"mariadb": Dialects.MYSQL,
diff --git a/tests/unit_tests/sql/dialects/impala_tests.py b/tests/unit_tests/sql/dialects/impala_tests.py
new file mode 100644
index 0000000..5e1618d
--- /dev/null
+++ b/tests/unit_tests/sql/dialects/impala_tests.py
@@ -0,0 +1,319 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+from sqlglot import exp, parse_one
+
+from superset.sql.dialects.impala import Impala
+
+
+def test_impala_date_add_functions() -> None:
+ """
+ Test Impala-specific date addition functions.
+ """
+ # Test MONTHS_ADD
+ sql = "SELECT MONTHS_ADD(date_col, 3) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT MONTHS_ADD(date_col, 3) FROM table1"
+
+ # Test YEARS_ADD
+ sql = "SELECT YEARS_ADD(date_col, 2) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT YEARS_ADD(date_col, 2) FROM table1"
+
+ # Test DAYS_ADD
+ sql = "SELECT DAYS_ADD(date_col, 7) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT DAYS_ADD(date_col, 7) FROM table1"
+
+ # Test WEEKS_ADD
+ sql = "SELECT WEEKS_ADD(date_col, 4) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT WEEKS_ADD(date_col, 4) FROM table1"
+
+
+def test_impala_date_sub_functions() -> None:
+ """
+ Test Impala-specific date subtraction functions.
+ """
+ # Test MONTHS_SUB
+ sql = "SELECT MONTHS_SUB(date_col, 3) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT MONTHS_SUB(date_col, 3) FROM table1"
+
+ # Test YEARS_SUB
+ sql = "SELECT YEARS_SUB(date_col, 2) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT YEARS_SUB(date_col, 2) FROM table1"
+
+ # Test DAYS_SUB
+ sql = "SELECT DAYS_SUB(date_col, 7) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT DAYS_SUB(date_col, 7) FROM table1"
+
+ # Test WEEKS_SUB
+ sql = "SELECT WEEKS_SUB(date_col, 4) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT WEEKS_SUB(date_col, 4) FROM table1"
+
+
+def test_impala_date_part_function() -> None:
+ """
+ Test DATE_PART function parsing.
+ """
+ # Test DATE_PART with YEAR
+ sql = "SELECT DATE_PART('YEAR', date_col) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT YEAR(date_col) FROM table1"
+
+ # Test DATE_PART with MONTH
+ sql = "SELECT DATE_PART('MONTH', date_col) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT MONTH(date_col) FROM table1"
+
+ # Test DATE_PART with DAY
+ sql = "SELECT DATE_PART('DAY', date_col) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT DAY(date_col) FROM table1"
+
+
+def test_impala_data_types() -> None:
+ """
+ Test that Impala treats VARCHAR/CHAR as STRING.
+ """
+ # Test VARCHAR conversion
+ sql = "CREATE TABLE test (col1 VARCHAR(100))"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "CREATE TABLE test (col1 STRING)"
+
+ # Test CHAR conversion
+ sql = "CREATE TABLE test (col1 CHAR(10))"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "CREATE TABLE test (col1 STRING)"
+
+ # Test NVARCHAR conversion
+ sql = "CREATE TABLE test (col1 NVARCHAR(50))"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "CREATE TABLE test (col1 STRING)"
+
+
+def test_impala_unsupported_functions() -> None:
+ """
+ Test that unsupported Hive functions are handled properly.
+ """
+ # STR_TO_MAP is not supported
+ with pytest.raises(Exception) as exc_info:
+ sql = "SELECT STR_TO_MAP('a:1,b:2', ',', ':') FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ ast.sql(dialect=Impala)
+ assert "STR_TO_MAP is not supported" in str(exc_info.value)
+
+ # TRANSFORM is not supported
+ with pytest.raises(Exception) as exc_info:
+ sql = "SELECT TRANSFORM(col) USING 'script.py' FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ ast.sql(dialect=Impala)
+ assert "TRANSFORM is not supported" in str(exc_info.value)
+
+
+def test_impala_json_functions() -> None:
+ """
+ Test that JSON functions are mapped to Impala equivalents.
+ """
+ # Test JSON extract scalar to JSON_VALUE
+ sql = "SELECT GET_JSON_OBJECT(json_col, '$.field') FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT JSON_VALUE(json_col, '$.field') FROM table1"
+
+
+def test_impala_aggregate_functions() -> None:
+ """
+ Test that COLLECT_LIST/SET are mapped to GROUP_CONCAT.
+ """
+ # Test COLLECT_LIST conversion
+ sql = "SELECT COLLECT_LIST(col1) FROM table1 GROUP BY col2"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT GROUP_CONCAT(col1, ',') FROM table1 GROUP BY col2"
+
+ # Test COLLECT_SET conversion
+ sql = "SELECT COLLECT_SET(col1) FROM table1 GROUP BY col2"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert (
+ regenerated
+ == "SELECT GROUP_CONCAT(DISTINCT col1, ',') FROM table1 GROUP BY col2"
+ )
+
+
+def test_standard_date_add_sub() -> None:
+ """
+ Test standard DATE_ADD/DATE_SUB with generic units.
+ """
+ # Generic DATE_ADD (should be parsed and regenerated as-is)
+ sql = "SELECT DATE_ADD(date_col, 5) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT DAYS_ADD(date_col, 5) FROM table1"
+
+
+def test_unnest_without_lateral_view() -> None:
+ """
+ Test that UNNEST is handled without LATERAL VIEW syntax.
+ """
+ # Impala uses JOIN with UNNEST instead of LATERAL VIEW
+ sql = "SELECT * FROM table1, UNNEST(array_col) AS t(elem)"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ # Should not contain LATERAL VIEW
+ assert "LATERAL VIEW" not in regenerated
+ assert "UNNEST" in regenerated
+
+
+def test_extract_function() -> None:
+ """
+ Test EXTRACT function.
+ """
+ sql = "SELECT EXTRACT(YEAR FROM date_col) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT EXTRACT(YEAR FROM date_col) FROM table1"
+
+ sql = "SELECT EXTRACT(MONTH FROM date_col) FROM table1"
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+ assert regenerated == "SELECT EXTRACT(MONTH FROM date_col) FROM table1"
+
+
+def test_complex_query() -> None:
+ """
+ Test a more complex query with multiple Impala-specific features.
+ """
+ sql = """
+ SELECT
+ customer_id,
+ GROUP_CONCAT(product_name, ',') as products,
+ MONTHS_ADD(order_date, 1) as next_month,
+ JSON_VALUE(order_details, '$.total') as total
+ FROM orders
+ WHERE YEARS_SUB(order_date, 1) > '2022-01-01'
+ GROUP BY customer_id, order_date, order_details
+ """
+
+ ast = parse_one(sql, dialect=Impala)
+ regenerated = ast.sql(dialect=Impala)
+
+ # Check key components are preserved
+ assert "GROUP_CONCAT" in regenerated
+ assert "MONTHS_ADD" in regenerated
+ assert "JSON_VALUE" in regenerated
+ assert "YEARS_SUB" in regenerated
+
+
+@pytest.mark.parametrize(
+ "func, expected_class, unit",
+ [
+ ("MONTHS_ADD(x, 1)", exp.DateAdd, "MONTH"),
+ ("YEARS_SUB(x, 2)", exp.DateSub, "YEAR"),
+ ("DAYS_ADD(x, 3)", exp.DateAdd, "DAY"),
+ ("WEEKS_SUB(x, 4)", exp.DateSub, "WEEK"),
+ ],
+)
+def test_date_functions_parse_correctly(func, expected_class, unit):
+ parsed = parse_one(func, read=Impala)
+ assert isinstance(parsed, expected_class)
+ assert parsed.text("unit").upper() == unit
+
+
+@pytest.mark.parametrize(
+ "sql, expected_expr",
+ [
+ ("DATE_PART('year', d)", exp.Year),
+ ("DATE_PART('second', d)", exp.Second),
+ ("EXTRACT(year FROM d)", exp.Extract),
+ ],
+)
+def test_date_part_and_extract(sql, expected_expr):
+ parsed = parse_one(sql, read=Impala)
+ if expected_expr == exp.Extract:
+ assert isinstance(parsed, expected_expr)
+ else:
+ assert isinstance(parsed, expected_expr)
+
+
+@pytest.mark.parametrize(
+ "expr, expected_sql",
+ [
+ (
+ exp.DateAdd(
+ this=exp.Column(this="x"),
+ expression=exp.Literal.number(1),
+ unit=exp.Literal.string("YEAR"),
+ ),
+ "YEARS_ADD(x, 1)",
+ ),
+ (
+ exp.DateSub(
+ this=exp.Column(this="y"),
+ expression=exp.Literal.number(2),
+ unit=exp.Literal.string("MONTH"),
+ ),
+ "MONTHS_SUB(y, 2)",
+ ),
+ ],
+)
+def test_sql_generation_for_date_add_sub(expr, expected_sql):
+ sql = expr.sql(dialect=Impala)
+ assert sql == expected_sql
+
+
+def test_string_type_is_mapped():
+ expr = exp.DataType.build("VARCHAR")
+ sql = expr.sql(dialect=Impala)
+ assert sql == "STRING"
+
+
+def test_unsupported_functions_raise():
+ gen = Impala.Generator()
+ with pytest.raises(Exception):
+ gen.sql(exp.StrToMap(this=exp.Literal.string("x")))
+
+
+@pytest.mark.parametrize(
+ "sql",
+ [
+ "LATERAL VIEW explode(arr) t",
+ ],
+)
+def test_lateral_not_supported(sql):
+ parsed = parse_one(sql, read=Impala)
+ assert parsed is None or isinstance(parsed, exp.Expression)