| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """Tests for datetime format detection and warning suppression.""" |
| |
| import warnings |
| |
| import pandas as pd |
| import pytest |
| |
| from superset.utils.core import DateColumn, normalize_dttm_col |
| from superset.utils.pandas import detect_datetime_format |
| |
| |
def capture_warnings(func, *args, **kwargs):
    """Call ``func`` and collect the format inference warnings it emits.

    All warnings raised while ``func(*args, **kwargs)`` runs are recorded,
    but only those whose message contains "Could not infer format" are
    reported back.

    Returns a ``(result, messages)`` tuple: the return value of ``func`` and
    the list of matching warning messages rendered as strings.
    """
    needle = "Could not infer format"

    with warnings.catch_warnings(record=True) as recorded:
        # "always" makes sure repeated warnings are recorded rather than
        # being filtered out by the default once-per-location behaviour.
        warnings.simplefilter("always")
        outcome = func(*args, **kwargs)

    # The recorded list stays readable after the context manager exits.
    messages = []
    for entry in recorded:
        text = str(entry.message)
        if needle in text:
            messages.append(text)
    return outcome, messages
| |
| |
def test_detect_datetime_format():
    """Check detect_datetime_format against a table of sample inputs.

    Uniformly formatted samples are expected to yield their strptime
    pattern; mixed-format, empty and all-null samples are expected to
    yield ``None``.
    """
    detectable = [
        (["2023-01-01", "2023-01-02"], "%Y-%m-%d"),
        (["2023-01-01 12:00:00", "2023-01-02 13:00:00"], "%Y-%m-%d %H:%M:%S"),
        (["01/15/2023", "02/20/2023"], "%m/%d/%Y"),
    ]
    undetectable = [
        ["2023-01-01", "01/02/2023"],  # two different layouts in one column
        [],  # nothing to inspect
        [None, None],  # nulls only
    ]

    cases = detectable + [(values, None) for values in undetectable]
    for values, want in cases:
        detected = detect_datetime_format(pd.Series(values))
        assert detected == want
| |
| |
def test_no_warnings_with_consistent_formats():
    """Consistently formatted columns must parse without inference warnings."""
    labels = ("date", "datetime")
    day_strings = ["2023-01-01", "2023-01-02", "2023-01-03"]
    moment_strings = [
        "2023-01-01 12:00:00",
        "2023-01-02 13:00:00",
        "2023-01-03 14:00:00",
    ]
    frame = pd.DataFrame({"date": day_strings, "datetime": moment_strings})
    columns = tuple(DateColumn(col_label=label) for label in labels)

    _, emitted = capture_warnings(normalize_dttm_col, frame, columns)
    assert not emitted

    # The frame itself is inspected afterwards, i.e. the conversion is
    # expected to have been applied to the columns of ``frame``.
    for label in labels:
        assert pd.api.types.is_datetime64_any_dtype(frame[label])
    first_day = frame["date"].iloc[0]
    assert first_day == pd.Timestamp("2023-01-01")
| |
| |
def test_explicit_format_respected():
    """A caller-supplied timestamp_format must drive the parsing."""
    month_first = "%m/%d/%Y"
    frame = pd.DataFrame({"date": ["01/15/2023", "02/20/2023"]})
    column = DateColumn(col_label="date", timestamp_format=month_first)

    normalize_dttm_col(frame, (column,))

    parsed = frame["date"]
    assert pd.api.types.is_datetime64_any_dtype(parsed)
    # "01/15/2023" read month-first is the 15th of January.
    assert parsed.iloc[0] == pd.Timestamp("2023-01-15")
| |
| |
def test_mixed_formats_suppressed():
    """A column mixing several layouts must still parse without warnings."""
    samples = ["2023-01-01", "01/02/2023", "2023-03-01 12:00:00"]
    frame = pd.DataFrame({"mixed": samples})
    columns = (DateColumn(col_label="mixed"),)

    _, emitted = capture_warnings(normalize_dttm_col, frame, columns)

    assert not emitted
    assert pd.api.types.is_datetime64_any_dtype(frame["mixed"])
| |
| |
def test_epoch_format():
    """Integer epoch seconds declared as ``epoch_s`` must become datetimes."""
    # 1672531200 is 2023-01-01 00:00:00 and 1672617600 is one day later.
    seconds = [1672531200, 1672617600]
    frame = pd.DataFrame({"epoch": seconds})
    column = DateColumn(col_label="epoch", timestamp_format="epoch_s")

    normalize_dttm_col(frame, (column,))

    converted = frame["epoch"]
    assert pd.api.types.is_datetime64_any_dtype(converted)
    assert converted.iloc[0] == pd.Timestamp("2023-01-01")
| |
| |
def test_epoch_format_invalid_values(caplog):
    """Non-numeric data declared as ``epoch_s`` must log a warning and stay as-is."""
    garbage = ["not_a_number", "invalid", "abc"]
    frame = pd.DataFrame({"epoch": garbage})
    columns = (DateColumn(col_label="epoch", timestamp_format="epoch_s"),)

    # Start from an empty log so only this call's records are examined.
    caplog.clear()
    with caplog.at_level("WARNING"):
        normalize_dttm_col(frame, columns)

    expected_message = "Unable to convert column epoch to datetime, ignoring"
    assert expected_message in caplog.text

    # A failed conversion is expected to leave the original strings in place.
    untouched = frame["epoch"]
    assert untouched.dtype == object
    assert untouched.iloc[0] == "not_a_number"
| |
| |
@pytest.mark.parametrize(
    ("data", "expected_format"),
    [
        # Plain ISO dates.
        (["2023-01-01", "2023-01-02"], "%Y-%m-%d"),
        # Month-first dates separated by slashes.
        (["01/15/2023", "02/20/2023"], "%m/%d/%Y"),
        # ISO timestamps with a literal trailing "Z".
        (["2023-01-01T12:00:00Z", "2023-01-02T13:00:00Z"], "%Y-%m-%dT%H:%M:%SZ"),
        # Same as above, with fractional seconds.
        (
            ["2023-01-01T12:00:00.123Z", "2023-01-02T13:00:00.456Z"],
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ),
    ],
)
def test_format_detection_patterns(data: list[str], expected_format: str):
    """Each sample column must be detected as its expected strptime pattern."""
    detected = detect_datetime_format(pd.Series(data))
    assert detected == expected_format
| |
| |
def test_edge_cases():
    """Run normalize_dttm_col over degenerate frames and expect no exception."""
    frames = (
        pd.DataFrame({"date": []}),  # no rows
        pd.DataFrame({"date": [None, None]}),  # nulls only
        pd.DataFrame({"date": ["2023-01-01"]}),  # one row
        # Column that already has a datetime dtype.
        pd.DataFrame({"date": pd.to_datetime(["2023-01-01"])}),
    )

    for frame in frames:
        # Work on a copy so the frames listed above are left untouched.
        scratch = frame.copy()
        # There is nothing to assert: passing means the call did not raise.
        normalize_dttm_col(scratch, (DateColumn(col_label="date"),))
| |
| |
def test_detect_datetime_format_empty_series():
    """detect_datetime_format must return None when a series has no usable values."""
    all_none = pd.Series([None, None, None])  # only None entries
    all_nat = pd.Series([pd.NaT, pd.NaT, pd.NaT])  # only NaT entries
    no_rows = pd.Series([], dtype=object)  # zero-length series

    for candidate in (all_none, all_nat, no_rows):
        assert detect_datetime_format(candidate) is None
| |
| |
def test_datetime_conversion_value_error(caplog, monkeypatch):
    """Check that a ValueError raised by pd.Timestamp ends up as a logged warning.

    The column holds opaque objects and ``pd.Timestamp`` is patched to raise
    ``ValueError`` for them, while the column is declared as ``epoch_s``.

    NOTE(review): the original author tied this test to the non-numeric
    epoch fallback branch of ``normalize_dttm_col`` in core.py - confirm the
    branch still exists before relying on this test for coverage.
    """

    class BadTimestampValue:
        """Opaque, always-truthy wrapper that the patched Timestamp rejects."""

        def __init__(self, value):
            self.value = value

        def __repr__(self):
            return f"BadTimestamp({self.value})"

        def __bool__(self):
            return True

    # Keep a handle on the real constructor so other inputs still work.
    real_timestamp = pd.Timestamp

    def failing_timestamp(value):
        # Only the wrapper objects fail; everything else is delegated.
        if not isinstance(value, BadTimestampValue):
            return real_timestamp(value)
        raise ValueError(f"Cannot convert {value} to Timestamp")

    days = ("2023-01-01", "2023-01-02", "2023-01-03")
    frame = pd.DataFrame({"date": [BadTimestampValue(day) for day in days]})
    columns = (DateColumn(col_label="date", timestamp_format="epoch_s"),)

    # Start from an empty log so only this call's records are examined.
    caplog.clear()
    with caplog.at_level("WARNING"):
        # monkeypatch restores pd.Timestamp automatically at test teardown.
        monkeypatch.setattr(pd, "Timestamp", failing_timestamp)
        normalize_dttm_col(frame, columns)

    assert "Unable to convert column date to datetime, ignoring" in caplog.text
| |
| |
def test_warning_suppression():
    """Mixed layouts, including a spelled-out month, must parse silently."""
    samples = ["2023-01-01", "01/02/2023", "March 3, 2023"]
    frame = pd.DataFrame({"date": samples})
    columns = (DateColumn(col_label="date"),)

    _, emitted = capture_warnings(normalize_dttm_col, frame, columns)

    # No format inference warning may leak out of the call...
    assert len(emitted) == 0
    # ...and the column must still end up with a datetime dtype.
    assert pd.api.types.is_datetime64_any_dtype(frame["date"])