| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # pylint: disable=unused-argument, import-outside-toplevel |
| from datetime import datetime |
| |
| import numpy as np |
| import pytest |
| from pandas import Timestamp |
| from pandas._libs.tslibs import NaT |
| |
| from superset.dataframe import df_to_records |
| from superset.db_engine_specs import BaseEngineSpec |
| from superset.result_set import SupersetResultSet |
| from superset.superset_typing import DbapiDescription |
| from superset.utils import json as superset_json |
| |
| |
def test_df_to_records() -> None:
    """Check that an all-string result set is returned as one dict per row."""
    column_names = ("a", "b", "c")
    rows = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
    description: DbapiDescription = [
        (name, "string", None, None, None, None, False) for name in column_names
    ]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    expected = [
        {"a": "a1", "b": "b1", "c": "c1"},
        {"a": "a2", "b": "b2", "c": "c2"},
    ]
    assert df_to_records(frame) == expected
| |
| |
def test_df_to_records_NaT_type() -> None:  # noqa: N802
    """Check that NaT maps to None and a tz-aware timestamp to its string form."""
    aware_ts = Timestamp("2023-01-06 20:50:31.749000+0000", tz="UTC")
    rows = [(NaT,), (aware_ts,)]
    description: DbapiDescription = [
        ("date", "timestamp with time zone", None, None, None, None, False)
    ]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    expected = [
        {"date": None},
        {"date": "2023-01-06 20:50:31.749000+00:00"},
    ]
    assert df_to_records(frame) == expected
| |
| |
def test_df_to_records_mixed_emoji_type() -> None:
    """Check a varchar column mixing plain text, emoji text and a NaT entry."""
    question = "What's up?"
    responses = [
        "This is a string text",
        "This is a string with an 😍 added",
        NaT,
        "Last emoji 😁",
    ]
    # Counts run 1..4 in row order.
    rows = [
        (question, response, count)
        for count, response in enumerate(responses, start=1)
    ]
    description: DbapiDescription = [
        (name, type_name, None, None, None, None, False)
        for name, type_name in (
            ("question", "varchar"),
            ("response", "varchar"),
            ("count", "integer"),
        )
    ]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # The NaT response is expected back as None; the strings are unchanged.
    expected_responses = [
        "This is a string text",
        "This is a string with an 😍 added",
        None,
        "Last emoji 😁",
    ]
    expected = [
        {"question": question, "response": response, "count": count}
        for count, response in enumerate(expected_responses, start=1)
    ]
    assert df_to_records(frame) == expected
| |
| |
def test_df_to_records_mixed_accent_type() -> None:
    """Check a varchar column mixing plain text, accented text and a NaT entry."""
    question = "What's up?"
    responses = [
        "This is a string text",
        "This is a string with áccent",
        NaT,
        "móre áccent",
    ]
    # Counts run 1..4 in row order.
    rows = [
        (question, response, count)
        for count, response in enumerate(responses, start=1)
    ]
    description: DbapiDescription = [
        (name, type_name, None, None, None, None, False)
        for name, type_name in (
            ("question", "varchar"),
            ("response", "varchar"),
            ("count", "integer"),
        )
    ]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # The NaT response is expected back as None; the strings are unchanged.
    expected_responses = [
        "This is a string text",
        "This is a string with áccent",
        None,
        "móre áccent",
    ]
    expected = [
        {"question": question, "response": response, "count": count}
        for count, response in enumerate(expected_responses, start=1)
    ]
    assert df_to_records(frame) == expected
| |
| |
def test_js_max_int() -> None:
    """Check that a very large integer comes back as a string.

    The small integers in the same result set are expected to stay ints.

    NOTE(review): judging by the test name, the cut-off is presumably
    JavaScript's maximum safe integer -- confirm against ``df_to_records``.
    """
    rows = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
    description: DbapiDescription = [
        (name, type_name, None, None, None, None, False)
        for name, type_name in (("a", "int"), ("b", "int"), ("c", "string"))
    ]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    expected = [
        {"a": 1, "b": "1239162456494753670", "c": "c1"},
        {"a": 2, "b": 100, "c": "c2"},
    ]
    assert df_to_records(frame) == expected
| |
| |
@pytest.mark.parametrize(
    "input_, expected",
    [
        # Expected values here remain plain ``datetime`` objects.
        # NOTE(review): presumably 23:47:17 lies just past ``Timestamp.max``,
        # so the column cannot be converted -- confirm against pandas.
        pytest.param(
            [
                (datetime(1677, 9, 22, 0, 12, 43), 1),
                (datetime(2262, 4, 11, 23, 47, 17), 2),
            ],
            [
                {"a": datetime(1677, 9, 22, 0, 12, 43), "b": 1},
                {"a": datetime(2262, 4, 11, 23, 47, 17), "b": 2},
            ],
            id="timestamp conversion fail",
        ),
        # One second inside on both ends: values come back as ``Timestamp``.
        pytest.param(
            [
                (datetime(1677, 9, 22, 0, 12, 44), 1),
                (datetime(2262, 4, 11, 23, 47, 16), 2),
            ],
            [
                {"a": Timestamp("1677-09-22 00:12:44"), "b": 1},
                {"a": Timestamp("2262-04-11 23:47:16"), "b": 2},
            ],
            id="timestamp conversion success",
        ),
    ],
)
def test_max_pandas_timestamp(input_, expected) -> None:
    """Check ``df_to_records`` output for datetimes near the extremes.

    ``input_`` holds the raw rows as (datetime, int) tuples and ``expected``
    the records that ``df_to_records`` should return for them.
    """
    description: DbapiDescription = [
        (name, type_name, None, None, None, None, False)
        for name, type_name in (("a", "datetime"), ("b", "int"))
    ]
    frame = SupersetResultSet(input_, description, BaseEngineSpec).to_pandas_df()

    assert df_to_records(frame) == expected
| |
| |
def test_df_to_records_with_nan_from_division_by_zero() -> None:
    """Check that NaN cells (e.g. the result of 0.00 / 0.00) come back as None."""
    # Same shape as the Athena query: select 0.00 / 0.00 as test
    nan = np.nan
    rows = [(nan,), (5.0,), (nan,)]
    description: DbapiDescription = [("test", "double", None, None, None, None, False)]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    expected = [{"test": value} for value in (None, 5.0, None)]
    assert df_to_records(frame) == expected
| |
| |
def test_df_to_records_with_mixed_nan_and_valid_values() -> None:
    """Check NaN handling when several columns mix NaN with real values."""
    nan = np.nan

    # Each numeric column holds at least one NaN; the last row is all NaN.
    rows = [
        ("row1", 10.5, nan, 100),
        ("row2", nan, 20.3, 200),
        ("row3", 30.7, 40.2, nan),
        ("row4", nan, nan, nan),
    ]
    description: DbapiDescription = [
        (name, type_name, None, None, None, None, False)
        for name, type_name in (
            ("name", "varchar"),
            ("value1", "double"),
            ("value2", "double"),
            ("value3", "int"),
        )
    ]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # Every NaN above is expected back as None, in the same position.
    expected = [
        {"name": "row1", "value1": 10.5, "value2": None, "value3": 100},
        {"name": "row2", "value1": None, "value2": 20.3, "value3": 200},
        {"name": "row3", "value1": 30.7, "value2": 40.2, "value3": None},
        {"name": "row4", "value1": None, "value2": None, "value3": None},
    ]
    assert df_to_records(frame) == expected
| |
| |
def test_df_to_records_with_inf_and_nan() -> None:
    """Test that NaN becomes None while infinities and finite values pass through.

    Every row is compared in full, so the ``description`` column is verified
    for all rows rather than only the first one.
    """
    # Edge cases: NaN, positive infinity, negative infinity, zero, ordinary
    data = [
        (np.nan, "division by zero"),
        (np.inf, "positive infinity"),
        (-np.inf, "negative infinity"),
        (0.0, "zero"),
        (42.5, "normal value"),
    ]
    cursor_descr: DbapiDescription = [
        ("result", "double", None, None, None, None, False),
        ("description", "varchar", None, None, None, None, False),
    ]
    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
    df = results.to_pandas_df()

    records = df_to_records(df)

    # Only NaN is expected to be replaced by None. +/-inf are expected back as
    # floats. Note that strict JSON (RFC 8259) has no literal for infinity, so
    # such values are NOT spec-compliant JSON and still need care when
    # serialized.
    assert records == [
        {"result": None, "description": "division by zero"},
        {"result": np.inf, "description": "positive infinity"},
        {"result": -np.inf, "description": "negative infinity"},
        {"result": 0.0, "description": "zero"},
        {"result": 42.5, "description": "normal value"},
    ]

    # Equality alone would not distinguish None from an object that merely
    # compares equal to it, so keep the identity check for the NaN row.
    assert records[0]["result"] is None
| |
| |
def test_df_to_records_nan_json_serialization() -> None:
    """
    Check that records containing NaN serialize to JSON once NaN is None.

    Two paths are exercised on the same frame:

    * ``df_to_records`` output, which is expected to hold ``None`` in place
      of NaN and to round-trip through ``superset_json`` unchanged;
    * raw ``DataFrame.to_dict(orient="records")`` output, which still holds
      NaN, is expected to raise ``ValueError`` when dumped with
      ``ignore_nan=False`` and to yield null only with ``ignore_nan=True``.
    """
    # Same shape as the Athena query: SELECT 0.00 / 0.00 as test
    rows = [(np.nan,), (5.0,), (np.nan,)]
    description: DbapiDescription = [("test", "double", None, None, None, None, False)]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # Path 1: df_to_records replaces NaN with None.
    converted = df_to_records(frame)
    assert converted == [
        {"test": None},
        {"test": 5.0},
        {"test": None},
    ]

    # A plain dumps/loads round trip must reproduce the records exactly.
    round_tripped = superset_json.loads(superset_json.dumps(converted))
    assert round_tripped == converted

    # Path 2: the raw pandas conversion (the pre-fix behaviour) keeps NaN.
    raw_records = frame.to_dict(orient="records")
    assert np.isnan(raw_records[0]["test"])
    assert raw_records[1]["test"] == 5.0
    assert np.isnan(raw_records[2]["test"])

    # Dumping raw NaN with ignore_nan=False is rejected -- this is the error
    # callers would hit if NaN were not converted.
    with pytest.raises(
        ValueError, match="Out of range float values are not JSON compliant"
    ):
        superset_json.dumps(raw_records, ignore_nan=False)

    # ignore_nan=True also produces null, but only if every caller passes it;
    # converting in df_to_records makes the flag unnecessary.
    reparsed = superset_json.loads(superset_json.dumps(raw_records, ignore_nan=True))
    assert reparsed[0]["test"] is None
| |
| |
def test_df_to_records_with_json_serialization_like_sql_lab() -> None:
    """
    Reproduce the SQL Lab flow: convert to records, then dump with superset_json.

    The records are serialized twice, with ``ignore_nan=True`` and with
    ``ignore_nan=False``; both dumps are expected to parse to the same data,
    with null wherever the query returned NaN.
    """
    nan = np.nan
    rows = [
        ("user1", 100.0, nan),
        ("user2", nan, 50.0),
        ("user3", 75.0, 25.0),
    ]
    description: DbapiDescription = [
        (name, type_name, None, None, None, None, False)
        for name, type_name in (
            ("name", "varchar"),
            ("value1", "double"),
            ("value2", "double"),
        )
    ]
    frame = SupersetResultSet(rows, description, BaseEngineSpec).to_pandas_df()

    # Same call shape SQL Lab uses: fall back to an empty list when falsy.
    records = df_to_records(frame) or []

    serializer = superset_json.json_iso_dttm_ser

    # First pass: the flag set, as in the SQL Lab serialization call.
    flagged_json = superset_json.dumps(records, default=serializer, ignore_nan=True)
    flagged = superset_json.loads(flagged_json)
    assert flagged[0]["value2"] is None  # NaN became null
    assert flagged[1]["value1"] is None  # NaN became null
    assert flagged[0]["value1"] == 100.0

    # Second pass: no flag needed because NaN was already mapped to None.
    unflagged_json = superset_json.dumps(
        records, default=serializer, ignore_nan=False
    )
    unflagged = superset_json.loads(unflagged_json)
    assert unflagged == flagged  # Same result