tests/unit_tests/commands/databases/csv_reader_test.py - superset - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import io
 from datetime import datetime

 import numpy as np
 import pandas as pd
 import pytest
 from werkzeug.datastructures import FileStorage

 from superset.commands.database.exceptions import DatabaseUploadFailed
 from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
 from tests.unit_tests.fixtures.common import create_csv_file

 CSV_DATA = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "30", "city1", "1990-02-01"],
     ["name2", "25", "city2", "1995-02-01"],
     ["name3", "20", "city3", "2000-02-01"],
 ]

 CSV_DATA_CHANGED_HEADER = [
     ["name1", "30", "city1", "1990-02-01"],
     ["Name", "Age", "City", "Birth"],
     ["name2", "25", "city2", "1995-02-01"],
     ["name3", "20", "city3", "2000-02-01"],
 ]

 CSV_DATA_WITH_NULLS = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "N/A", "city1", "1990-02-01"],
     ["name2", "25", "None", "1995-02-01"],
     ["name3", "20", "city3", "2000-02-01"],
 ]

 CSV_DATA_DAY_FIRST = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "30", "city1", "01-02-1990"],
 ]

 CSV_DATA_DECIMAL_CHAR = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "30,1", "city1", "1990-02-01"],
 ]

 CSV_DATA_SKIP_INITIAL_SPACE = [
     ["         Name", "Age", "City", "Birth"],
     ["      name1", "30", "city1", "1990-02-01"],
 ]


 @pytest.mark.parametrize(
     "file, options, expected_cols, expected_values",
     [
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA, delimiter="|"),
             CSVReaderOptions(delimiter="|"),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=["Name", "Age"],
             ),
             ["Name", "Age"],
             [
                 ["name1", 30],
                 ["name2", 25],
                 ["name3", 20],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=["Name", "Age"],
                 column_data_types={"Age": "float"},
             ),
             ["Name", "Age"],
             [
                 ["name1", 30.0],
                 ["name2", 25.0],
                 ["name3", 20.0],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=[],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=[],
                 column_data_types={"Age": "float"},
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30.0, "city1", "1990-02-01"],
                 ["name2", 25.0, "city2", "1995-02-01"],
                 ["name3", 20.0, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 rows_to_read=1,
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30.0, "city1", "1990-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 rows_to_read=1,
                 columns_read=["Name", "Age"],
             ),
             ["Name", "Age"],
             [
                 ["name1", 30.0],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 skip_rows=1,
             ),
             ["name1", "30", "city1", "1990-02-01"],
             [
                 ["name2", 25.0, "city2", "1995-02-01"],
                 ["name3", 20.0, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 column_dates=["Birth"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
                 ["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
                 ["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
             ],
         ),
         (
             create_csv_file(CSV_DATA_CHANGED_HEADER),
             CSVReaderOptions(
                 header_row=1,
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA_WITH_NULLS),
             CSVReaderOptions(
                 null_values=["N/A", "None"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", np.nan, "city1", "1990-02-01"],
                 ["name2", 25.0, np.nan, "1995-02-01"],
                 ["name3", 20.0, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA_DAY_FIRST),
             CSVReaderOptions(
                 day_first=False,
                 column_dates=["Birth"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", datetime(1990, 1, 2, 0, 0)],
             ],
         ),
         (
             create_csv_file(CSV_DATA_DAY_FIRST),
             CSVReaderOptions(
                 day_first=True,
                 column_dates=["Birth"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
             ],
         ),
         (
             create_csv_file(CSV_DATA_DECIMAL_CHAR),
             CSVReaderOptions(
                 decimal_character=",",
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30.1, "city1", "1990-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA_SKIP_INITIAL_SPACE),
             CSVReaderOptions(
                 skip_initial_space=True,
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
             ],
         ),
     ],
 )
 def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_values):
     csv_reader = CSVReader(
         options=options,
     )
     df = csv_reader.file_to_dataframe(file)
     assert df.columns.tolist() == expected_cols
     actual_values = df.values.tolist()
     for i in range(len(expected_values)):
         for j in range(len(expected_values[i])):
             expected_val = expected_values[i][j]
             actual_val = actual_values[i][j]

             # Check if both values are NaN
             if isinstance(expected_val, float) and isinstance(actual_val, float):
                 assert np.isnan(expected_val) == np.isnan(actual_val)
             else:
                 assert expected_val == actual_val
     file.close()


 def test_csv_reader_index_column():
     csv_reader = CSVReader(
         options=CSVReaderOptions(index_column="Name"),
     )
     df = csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
     assert df.index.name == "Name"


 def test_csv_reader_wrong_index_column():
     csv_reader = CSVReader(
         options=CSVReaderOptions(index_column="wrong"),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
     assert str(ex.value) == "Parsing error: Index wrong invalid"


 def test_csv_reader_broken_file_no_columns():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file([""]))
     assert str(ex.value) == "Parsing error: No columns to parse from file"


 def test_csv_reader_wrong_columns_to_read():
     csv_reader = CSVReader(
         options=CSVReaderOptions(columns_read=["xpto"]),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
     assert str(ex.value) == (
         "Parsing error: Usecols do not match columns, "
         "columns expected but not found: ['xpto']"
     )


 def test_csv_reader_invalid_file():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(
             FileStorage(
                 io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"), filename=""
             )
         )
     assert str(ex.value) == (
         "Parsing error: Error tokenizing data. C error:"
         " Expected 3 fields in line 3, saw 7\n"
     )


 def test_csv_reader_invalid_encoding():
     """Test that encoding detection automatically handles problematic encoding."""
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     binary_data = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
     # The new encoding detection should automatically handle this
     df = csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
     assert df.columns.tolist() == ["col1", "col2", "col3"]
     assert len(df) == 2  # Should have 2 data rows


 def test_csv_reader_encoding_detection_latin1():
     """Test automatic encoding detection for Latin-1 encoded files."""
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     # Create a Latin-1 encoded file with special characters
     binary_data = "col1,col2,col3\nCafé,Résumé,naïve\n".encode("latin-1")
     df = csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
     assert df.columns.tolist() == ["col1", "col2", "col3"]
     assert df.values.tolist() == [["Café", "Résumé", "naïve"]]


 def test_csv_reader_encoding_detection_iso88591():
     """Test automatic encoding detection for ISO-8859-1 encoded files."""
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     # Create an ISO-8859-1 encoded file with special characters
     binary_data = "col1,col2\nCafé,naïve\n".encode("iso-8859-1")
     df = csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
     assert df.columns.tolist() == ["col1", "col2"]
     assert df.values.tolist() == [["Café", "naïve"]]


 def test_csv_reader_explicit_encoding():
     """Test that explicit encoding is respected."""
     csv_reader = CSVReader(
         options=CSVReaderOptions(encoding="latin-1"),
     )
     # Create a Latin-1 encoded file
     binary_data = "col1,col2\nCafé,naïve\n".encode("latin-1")
     df = csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
     assert df.columns.tolist() == ["col1", "col2"]
     assert df.values.tolist() == [["Café", "naïve"]]


 def test_csv_reader_encoding_detection_failure():
     """Test that undecodable files raise appropriate error."""
     csv_reader = CSVReader(
         options=CSVReaderOptions(encoding="ascii"),  # Force ASCII encoding
     )
     # Create data that can't be decoded as ASCII
     binary_data = b"col1,col2\n\xff\xfe,test\n"
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
     assert "Parsing error" in str(ex.value)


 def test_csv_reader_file_metadata():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     file = create_csv_file(CSV_DATA)
     metadata = csv_reader.file_metadata(file)
     assert metadata == {
         "items": [
             {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
         ]
     }
     file.close()

     file = create_csv_file(CSV_DATA, delimiter="|")
     csv_reader = CSVReader(
         options=CSVReaderOptions(delimiter="|"),
     )
     metadata = csv_reader.file_metadata(file)
     assert metadata == {
         "items": [
             {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
         ]
     }
     file.close()


 def test_csv_reader_file_metadata_invalid_file():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_metadata(
             FileStorage(io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"))
         )
     assert str(ex.value) == (
         "Parsing error: Error tokenizing data. C error:"
         " Expected 3 fields in line 3, saw 7\n"
     )


 def test_csv_reader_integer_in_float_column():
     csv_data = [
         ["Name", "Score", "City"],
         ["name1", 25.5, "city1"],
         ["name2", 25, "city2"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"Score": "float"})
     )

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (2, 3)
     assert df["Score"].dtype == "float64"


 def test_csv_reader_object_type_auto_inferring():
     # this case below won't raise a error
     csv_data = [
         ["Name", "id", "City"],
         ["name1", 25.5, "city1"],
         ["name2", 15, "city2"],
         ["name3", 123456789086, "city3"],
         ["name4", "abc", "city4"],
         ["name5", 4.75, "city5"],
     ]

     csv_reader = CSVReader()

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (5, 3)
     # pandas automatically infers the type if column_data_types is not informed
     # if there's only one string in the column it converts the whole column to object
     assert df["id"].dtype == "object"


 def test_csv_reader_float_type_auto_inferring():
     csv_data = [
         ["Name", "id", "City"],
         ["name1", "25", "city1"],
         ["name2", "15", "city2"],
         ["name3", "123456789086", "city3"],
         ["name5", "4.75", "city5"],
     ]

     csv_reader = CSVReader()

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (4, 3)
     # The type here is automatically inferred to float due to 4.75 value
     assert df["id"].dtype == "float64"


 def test_csv_reader_int_type_auto_inferring():
     csv_data = [
         ["Name", "id", "City"],
         ["name1", "0", "city1"],
         ["name2", "15", "city2"],
         ["name3", "123456789086", "city3"],
         ["name5", "45", "city5"],
     ]

     csv_reader = CSVReader()

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (4, 3)
     assert df["id"].dtype == "int64"


 def test_csv_reader_bigint_type_auto_inferring():
     csv_data = [
         ["Name", "id", "City"],
         ["name1", "9223372036854775807", "city1"],
         ["name2", "9223372036854775806", "city2"],
         ["name3", "1234567890123456789", "city3"],
         ["name4", "0", "city4"],
         ["name5", "-9223372036854775808", "city5"],
     ]

     csv_reader = CSVReader()

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (5, 3)
     assert df["id"].dtype == "int64"
     assert df.iloc[0]["id"] == 9223372036854775807
     assert df.iloc[4]["id"] == -9223372036854775808


 def test_csv_reader_int_typing():
     csv_data = [
         ["Name", "id", "City"],
         ["name1", "0", "city1"],
         ["name2", "15", "city2"],
         ["name3", "123456789086", "city3"],
         ["name5", "45", "city5"],
     ]

     csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"id": "int"}))

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (4, 3)
     assert df["id"].dtype == "int64"


 def test_csv_reader_float_typing():
     csv_data = [
         ["Name", "score", "City"],
         ["name1", "0", "city1"],
         ["name2", "15.3", "city2"],
         ["name3", "45", "city3"],
         ["name5", "23.1342", "city5"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"score": "float"})
     )

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (4, 3)
     assert df["score"].dtype == "float64"


 def test_csv_reader_multiple_errors_display():
     """Test that multiple errors are displayed with proper formatting."""
     csv_data = [
         ["Name", "Age", "Score"],
         ["Alice", "25", "95.5"],
         ["Bob", "invalid1", "87.2"],
         ["Charlie", "invalid2", "92.1"],
         ["Diana", "invalid3", "88.5"],
         ["Eve", "invalid4", "90.0"],
         ["Frank", "30", "85.5"],
     ]

     csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Age' to int64" in error_msg
     assert "Found 4 error(s):" in error_msg
     assert "Line 3: 'invalid1' cannot be converted to int64" in error_msg
     assert "Line 4: 'invalid2' cannot be converted to int64" in error_msg
     assert "Line 5: 'invalid3' cannot be converted to int64" in error_msg
     assert "Line 6: 'invalid4' cannot be converted to int64" in error_msg
     # With MAX_DISPLAYED_ERRORS = 5, all 4 errors should be shown without truncation
     assert "and" not in error_msg or "more error(s)" not in error_msg


 def test_csv_reader_non_numeric_in_integer_column():
     csv_data = [
         ["Name", "Age", "City"],
         ["name1", "abc", "city1"],
         ["name2", "25", "city2"],
     ]

     csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Age' to int64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 2: 'abc' cannot be converted to int64" in error_msg


 def test_csv_reader_non_numeric_in_float_column():
     csv_data = [
         ["Name", "Score", "City"],
         ["name1", "5.3", "city1"],
         ["name2", "25.5", "city2"],
         ["name3", "24.5", "city3"],
         ["name4", "1.0", "city4"],
         ["name5", "one point five", "city5"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"Score": "float64"})
     )

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Score' to float64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 6: 'one point five' cannot be converted to float64" in error_msg


 def test_csv_reader_improved_error_detection_int32():
     """Test improved error detection for int32 type casting."""
     csv_data = [
         ["Name", "ID", "City"],
         ["name1", "123", "city1"],
         ["name2", "456", "city2"],
         ["name3", "not_a_number", "city3"],
         ["name4", "789", "city4"],
     ]

     csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"ID": "int32"}))

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'ID' to int32" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 4: 'not_a_number' cannot be converted to int32" in error_msg


 def test_csv_reader_improved_error_detection_float32():
     """Test improved error detection for float32 type casting."""
     csv_data = [
         ["Name", "Score", "City"],
         ["name1", "1.5", "city1"],
         ["name2", "2.7", "city2"],
         ["name3", "invalid_float", "city3"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"Score": "float32"})
     )

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Score' to float32" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 4: 'invalid_float' cannot be converted to float32" in error_msg


 def test_csv_reader_error_detection_with_header_row():
     """Test that line numbers are correctly calculated with custom header row."""
     csv_data = [
         ["skip_this_row", "skip", "skip"],
         ["Name", "Age", "City"],
         ["name1", "25", "city1"],
         ["name2", "invalid_age", "city2"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(header_row=1, column_data_types={"Age": "int"})
     )

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Age' to int" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 4: 'invalid_age' cannot be converted to int" in error_msg


 def test_csv_reader_error_detection_first_row_error():
     """Test error detection when the first data row has the error."""

     csv_data = [
         ["Name", "Age", "City"],
         ["name1", "not_a_number", "city1"],
         ["name2", "25", "city2"],
     ]

     csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Age' to int64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 2: 'not_a_number' cannot be converted to int64" in error_msg


 def test_csv_reader_error_detection_missing_column():
     """Test that missing columns are handled gracefully."""
     csv_data = [
         ["Name", "City"],
         ["name1", "city1"],
         ["name2", "city2"],
     ]

     # Try to cast a column that doesn't exist
     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"NonExistent": "int64"})
     )

     # Should not raise an error for missing columns
     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
     assert df.shape == (2, 2)
     assert df.columns.tolist() == ["Name", "City"]


 def test_csv_reader_error_detection_mixed_valid_invalid():
     csv_data = [
         ["Name", "Score", "City"],
         ["name1", "95.5", "city1"],
         ["name2", "87.2", "city2"],
         ["name3", "92.1", "city3"],
         ["name4", "eighty-five", "city4"],
         ["name5", "78.9", "city5"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"Score": "float64"})
     )

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Score' to float64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 5: 'eighty-five' cannot be converted to float64" in error_msg


 def test_csv_reader_error_detection_multiple_invalid_values():
     """Test error detection with multiple invalid values showing first 5 + count."""
     csv_data = [
         ["Name", "Score", "City"],
         ["name1", "95.5", "city1"],
         ["name2", "87.2", "city2"],
         ["name3", "92.1", "city3"],
         ["name4", "eighty-five", "city4"],
         ["name4", "eighty-one", "city4"],
         ["name4", "eighty", "city4"],
         ["name4", "one", "city4"],
         ["name4", "two", "city4"],
         ["name4", "three", "city4"],
         ["name5", "78.9", "city5"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"Score": "float64"})
     )

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Score' to float64" in error_msg
     assert "Found 6 error(s):" in error_msg
     assert "Line 5: 'eighty-five' cannot be converted to float64" in error_msg
     assert "Line 6: 'eighty-one' cannot be converted to float64" in error_msg
     assert "Line 7: 'eighty' cannot be converted to float64" in error_msg
     assert "Line 8: 'one' cannot be converted to float64" in error_msg
     assert "Line 9: 'two' cannot be converted to float64" in error_msg
     assert "and 1 more error(s)" in error_msg


 def test_csv_reader_error_detection_non_numeric_types():
     """Test error detection for non-numeric type casting."""
     csv_data = [
         ["Name", "Status", "City"],
         ["name1", "active", "city1"],
         ["name2", "inactive", "city2"],
         ["name3", 123, "city3"],  # This should cause an error when casting to string
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"Status": "string"})
     )

     # For non-numeric types, the error detection should still work
     # but might have different behavior depending on pandas version
     try:
         df = csv_reader.file_to_dataframe(create_csv_file(csv_data))
         # If no error is raised, the conversion succeeded
         assert df["Status"].dtype == "string"
     except DatabaseUploadFailed as ex:
         # If an error is raised, it should have proper formatting
         error_msg = str(ex.value)
         assert "Cannot convert" in error_msg
         assert "Status" in error_msg


 def test_csv_reader_error_detection_with_null_values():
     csv_data = [
         ["Name", "Age", "City"],
         ["name1", "25", "city1"],
         ["name2", "", "city2"],
         ["name3", "invalid_age", "city3"],
     ]

     csv_reader = CSVReader(options=CSVReaderOptions(column_data_types={"Age": "int64"}))

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Age' to int64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg


 def test_csv_reader_successful_numeric_conversion():
     csv_data = [
         ["Name", "Age", "Score", "ID"],
         ["name1", "25", "95.5", "1001"],
         ["name2", "30", "87.2", "1002"],
         ["name3", "35", "92.1", "1003"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(
             column_data_types={
                 "Age": "int64",
                 "Score": "float64",
                 "ID": "int32",
             }
         )
     )

     df = csv_reader.file_to_dataframe(create_csv_file(csv_data))

     assert df.shape == (3, 4)
     assert df["Age"].dtype == "int64"
     assert df["Score"].dtype == "float64"
     assert df["ID"].dtype == "int32"
     assert df.iloc[0]["Age"] == 25
     assert df.iloc[0]["Score"] == 95.5
     assert df.iloc[0]["ID"] == 1001


 def test_csv_reader_error_detection_improvements_summary():
     csv_data_with_custom_header = [
         ["metadata_row", "skip", "this"],
         ["Name", "Age", "Score"],
         ["Alice", "25", "95.5"],
         ["Bob", "invalid_age", "87.2"],
         ["Charlie", "30", "92.1"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(
             header_row=1, column_data_types={"Age": "int64", "Score": "float64"}
         )
     )

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data_with_custom_header))

     error_msg = str(ex.value)
     assert "Cannot convert column 'Age' to int64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg

     # Test case 2: Multiple type errors - Age comes first alphabetically
     csv_data_multiple_errors = [
         ["Name", "Age", "Score"],
         ["Alice", "25", "95.5"],
         ["Bob", "invalid_age", "invalid_score"],  # Error in both columns (line 3)
         ["Charlie", "30", "92.1"],
     ]

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_data_types={"Age": "int64", "Score": "float64"})
     )

     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(csv_data_multiple_errors))

     error_msg = str(ex.value)
     # Should catch the Age error first (Age comes before Score alphabetically)
     assert "Cannot convert column 'Age' to int64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 3: 'invalid_age' cannot be converted to int64" in error_msg


 def test_csv_reader_cast_column_types_function():
     """Test the _cast_column_types function directly for better isolation."""
     # Create test DataFrame
     test_data = {
         "name": ["Alice", "Bob", "Charlie"],
         "age": ["25", "30", "invalid_age"],
         "score": ["95.5", "87.2", "92.1"],
     }
     df = pd.DataFrame(test_data)

     # Test successful casting
     types_success = {"age": "int64", "score": "float64"}
     kwargs = {"header": 0}

     # This should work for first two rows, but we'll only test the first two
     df_subset = df.iloc[:2].copy()
     result_df = CSVReader._cast_column_types(df_subset, types_success, kwargs)

     assert result_df["age"].dtype == "int64"
     assert result_df["score"].dtype == "float64"
     assert result_df.iloc[0]["age"] == 25
     assert result_df.iloc[0]["score"] == 95.5

     # Test error case
     with pytest.raises(DatabaseUploadFailed) as ex:
         CSVReader._cast_column_types(df, types_success, kwargs)

     error_msg = str(ex.value)
     assert "Cannot convert column 'age' to int64" in error_msg
     assert "Found 1 error(s):" in error_msg
     assert "Line 4: 'invalid_age' cannot be converted to int64" in error_msg


 def test_csv_reader_cast_column_types_missing_column():
     """Test _cast_column_types with missing columns."""
     test_data = {
         "name": ["Alice", "Bob"],
         "age": ["25", "30"],
     }
     df = pd.DataFrame(test_data)

     # Try to cast a column that doesn't exist
     types = {"age": "int64", "nonexistent": "float64"}
     kwargs = {"header": 0}

     # Should not raise an error for missing columns
     result_df = CSVReader._cast_column_types(df, types, kwargs)
     assert result_df["age"].dtype == "int64"
     assert "nonexistent" not in result_df.columns


 def test_csv_reader_cast_column_types_different_numeric_types():
     """Test _cast_column_types with various numeric types."""
     test_data = {
         "int32_col": ["1", "2", "3"],
         "int64_col": ["100", "200", "300"],
         "float32_col": ["1.5", "2.5", "3.5"],
         "float64_col": ["10.1", "20.2", "30.3"],
     }
     df = pd.DataFrame(test_data)

     types = {
         "int32_col": "int32",
         "int64_col": "int64",
         "float32_col": "float32",
         "float64_col": "float64",
     }
     kwargs = {"header": 0}

     result_df = CSVReader._cast_column_types(df, types, kwargs)

     assert result_df["int32_col"].dtype == "int32"
     assert result_df["int64_col"].dtype == "int64"
     assert result_df["float32_col"].dtype == "float32"
     assert result_df["float64_col"].dtype == "float64"


 def test_csv_reader_chunking_large_file():
     """Test that chunking is used for large files."""
     # Create a large CSV with more than 100k rows
     large_data = [["col1", "col2", "col3"]]
     for i in range(100001):
         large_data.append([f"val{i}", str(i), f"data{i}"])

     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     df = csv_reader.file_to_dataframe(create_csv_file(large_data))
     assert len(df) == 100001
     assert df.columns.tolist() == ["col1", "col2", "col3"]
     assert df.iloc[0].tolist() == ["val0", 0, "data0"]
     assert df.iloc[-1].tolist() == ["val100000", 100000, "data100000"]


 def test_csv_reader_chunking_with_rows_limit():
     """Test that chunking respects rows_to_read limit."""
     # Create a CSV with more than the chunk size
     large_data = [["col1", "col2"]]
     for i in range(60000):  # More than chunk size of 50000
         large_data.append([f"val{i}", str(i)])

     csv_reader = CSVReader(
         options=CSVReaderOptions(rows_to_read=55000),
     )
     df = csv_reader.file_to_dataframe(create_csv_file(large_data))
     assert len(df) == 55000
     assert df.columns.tolist() == ["col1", "col2"]


 def test_csv_reader_no_chunking_small_file():
     """Test that chunking is not used for small files."""
     # Create a small CSV (less than 2 * chunk size)
     small_data = [["col1", "col2"]]
     for i in range(1000):  # Much less than chunk size
         small_data.append([f"val{i}", str(i)])

     csv_reader = CSVReader(
         options=CSVReaderOptions(rows_to_read=1000),
     )
     df = csv_reader.file_to_dataframe(create_csv_file(small_data))
     assert len(df) == 1000
     assert df.columns.tolist() == ["col1", "col2"]


 def test_csv_reader_engine_selection():
     """Test engine selection based on feature flag."""
     from unittest.mock import MagicMock, patch

     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )

     # Test 1: Feature flag disabled (default) - should use c engine
     with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
         with patch(
             "superset.commands.database.uploaders.csv_reader.is_feature_enabled"
         ) as mock_flag:
             mock_flag.return_value = False
             mock_pd.__version__ = "2.0.0"
             mock_pd.read_csv = MagicMock(return_value=pd.DataFrame({"col1": [1, 2, 3]}))
             mock_pd.DataFrame = pd.DataFrame

             file = create_csv_file([["col1"], ["1"], ["2"], ["3"]])
             csv_reader.file_to_dataframe(file)

             # Check that c engine is selected when feature flag is disabled
             call_kwargs = mock_pd.read_csv.call_args[1]
             assert call_kwargs.get("engine") == "c"

     # Test 2: Feature flag enabled - pyarrow would be used but chunking prevents it
     with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
         with patch(
             "superset.commands.database.uploaders.csv_reader.is_feature_enabled"
         ) as mock_flag:
             with patch("importlib.util") as mock_util:
                 mock_flag.return_value = True
                 mock_pd.__version__ = "2.0.0"
                 mock_pd.read_csv = MagicMock(
                     return_value=pd.DataFrame({"col1": [1, 2, 3]})
                 )
                 mock_pd.DataFrame = pd.DataFrame
                 mock_pd.concat = MagicMock(
                     return_value=pd.DataFrame({"col1": [1, 2, 3]})
                 )
                 mock_util.find_spec = MagicMock(return_value=True)

                 file = create_csv_file([["col1"], ["1"], ["2"], ["3"]])
                 csv_reader.file_to_dataframe(file)

                 # Check that c engine is selected due to chunking (default behavior)
                 # Even with feature flag enabled, chunking prevents pyarrow usage
                 call_kwargs = mock_pd.read_csv.call_args[1]
                 assert call_kwargs.get("engine") == "c"

     # Test 3: Feature flag enabled but unsupported options - should use c engine
     with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
         with patch(
             "superset.commands.database.uploaders.csv_reader.is_feature_enabled"
         ) as mock_flag:
             mock_flag.return_value = True
             mock_pd.__version__ = "2.0.0"
             mock_pd.read_csv = MagicMock(return_value=pd.DataFrame({"col1": [1, 2, 3]}))
             mock_pd.DataFrame = pd.DataFrame

             # Create reader with date parsing (unsupported by pyarrow)
             csv_reader_with_dates = CSVReader(
                 options=CSVReaderOptions(column_dates=["date_col"]),
             )
             file = create_csv_file([["date_col"], ["2023-01-01"]])
             csv_reader_with_dates.file_to_dataframe(file)

             # Check that c engine is selected due to unsupported options
             call_kwargs = mock_pd.read_csv.call_args[1]
             assert call_kwargs.get("engine") == "c"


 def test_csv_reader_low_memory_setting():
     """Test that low_memory is set to False."""
     from unittest.mock import MagicMock, patch

     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )

     with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
         mock_pd.__version__ = "2.0.0"
         mock_pd.read_csv = MagicMock(return_value=pd.DataFrame({"col1": [1, 2, 3]}))
         mock_pd.DataFrame = pd.DataFrame

         file = create_csv_file([["col1"], ["1"], ["2"], ["3"]])
         csv_reader.file_to_dataframe(file)

         # Check that low_memory=False was set
         call_kwargs = mock_pd.read_csv.call_args[1]
         assert call_kwargs.get("low_memory") is False


 def test_csv_reader_cache_dates_setting():
     """Test that cache_dates is set to True for performance."""
     from unittest.mock import MagicMock, patch

     csv_reader = CSVReader(
         options=CSVReaderOptions(column_dates=["date_col"]),
     )

     with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
         mock_pd.__version__ = "2.0.0"
         mock_pd.read_csv = MagicMock(
             return_value=pd.DataFrame({"date_col": ["2023-01-01"]})
         )
         mock_pd.DataFrame = pd.DataFrame

         file = create_csv_file([["date_col"], ["2023-01-01"]])
         csv_reader.file_to_dataframe(file)

         # Check that cache_dates=True was set
         call_kwargs = mock_pd.read_csv.call_args[1]
         assert call_kwargs.get("cache_dates") is True


 def test_csv_reader_pyarrow_feature_flag():
     """
     Test that the CSV_UPLOAD_PYARROW_ENGINE feature flag controls engine selection.
     """
     import io
     from unittest.mock import MagicMock, patch

     from werkzeug.datastructures import FileStorage

     # Test _read_csv directly to avoid the file_to_dataframe chunking logic
     with patch(
         "superset.commands.database.uploaders.csv_reader.is_feature_enabled"
     ) as mock_flag:
         with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
             with patch.object(
                 CSVReader, "_select_optimal_engine"
             ) as mock_engine_select:
                 # Test 1: FF enabled, pyarrow available, no unsupported options
                 mock_flag.return_value = True
                 mock_pd.__version__ = "2.0.0"
                 mock_pd.read_csv = MagicMock(return_value=pd.DataFrame({"col1": [1]}))
                 mock_engine_select.return_value = "pyarrow"

                 # Create clean kwargs without any problematic options
                 clean_kwargs = {
                     "encoding": "utf-8",
                     "low_memory": False,
                     # No chunksize, iterator, nrows, parse_dates, or na_values
                 }

                 file = FileStorage(io.StringIO("col1\nval1"))
                 CSVReader._read_csv(file, clean_kwargs)

                 # Verify feature flag was checked
                 mock_flag.assert_called_with("CSV_UPLOAD_PYARROW_ENGINE")

                 # Verify engine selection method was called
                 mock_engine_select.assert_called_once()

                 # Verify pyarrow engine was selected
                 call_kwargs = mock_pd.read_csv.call_args[1]
                 assert call_kwargs.get("engine") == "pyarrow"

     # Test 2: Feature flag disabled
     with patch(
         "superset.commands.database.uploaders.csv_reader.is_feature_enabled"
     ) as mock_flag:
         with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
             mock_flag.return_value = False
             mock_pd.__version__ = "2.0.0"
             mock_pd.read_csv = MagicMock(return_value=pd.DataFrame({"col1": [1]}))

             clean_kwargs = {
                 "encoding": "utf-8",
                 "low_memory": False,
             }

             file = FileStorage(io.StringIO("col1\nval1"))
             CSVReader._read_csv(file, clean_kwargs)

             # Verify feature flag was checked
             mock_flag.assert_called_with("CSV_UPLOAD_PYARROW_ENGINE")

             # Verify c engine was selected when flag is disabled
             call_kwargs = mock_pd.read_csv.call_args[1]
             assert call_kwargs.get("engine") == "c"

     # Test 3: Feature flag enabled but unsupported options present
     with patch(
         "superset.commands.database.uploaders.csv_reader.is_feature_enabled"
     ) as mock_flag:
         with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
             mock_flag.return_value = True
             mock_pd.__version__ = "2.0.0"
             mock_pd.read_csv = MagicMock(return_value=pd.DataFrame({"col1": [1]}))

             # Include unsupported options
             unsupported_kwargs = {
                 "encoding": "utf-8",
                 "low_memory": False,
                 "nrows": 100,  # Unsupported by pyarrow
             }

             file = FileStorage(io.StringIO("col1\nval1"))
             CSVReader._read_csv(file, unsupported_kwargs)

             # Verify c engine was selected due to unsupported options
             call_kwargs = mock_pd.read_csv.call_args[1]
             assert call_kwargs.get("engine") == "c"


 def test_csv_reader_select_optimal_engine():
     """Test the _select_optimal_engine method with different scenarios."""
     from unittest.mock import MagicMock, patch

     # Test 1: PyArrow available, no built-in support
     with patch("superset.commands.database.uploaders.csv_reader.util") as mock_util:
         with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
             with patch("superset.commands.database.uploaders.csv_reader.logger"):
                 mock_util.find_spec = MagicMock(
                     return_value=MagicMock()
                 )  # PyArrow found
                 mock_pd.__version__ = "2.0.0"  # No pyarrow in version

                 # Mock successful pyarrow import
                 with patch.dict("sys.modules", {"pyarrow": MagicMock()}):
                     result = CSVReader._select_optimal_engine()
                     assert result == "pyarrow"

     # Test 2: PyArrow not available
     with patch("superset.commands.database.uploaders.csv_reader.util") as mock_util:
         with patch("superset.commands.database.uploaders.csv_reader.logger"):
             mock_util.find_spec = MagicMock(return_value=None)  # PyArrow not found

             result = CSVReader._select_optimal_engine()
             assert result == "c"

     # Test 3: Pandas with built-in pyarrow
     with patch("superset.commands.database.uploaders.csv_reader.util") as mock_util:
         with patch("superset.commands.database.uploaders.csv_reader.pd") as mock_pd:
             with patch("superset.commands.database.uploaders.csv_reader.logger"):
                 mock_util.find_spec = MagicMock(
                     return_value=MagicMock()
                 )  # PyArrow found
                 mock_pd.__version__ = "2.0.0+pyarrow"  # Has pyarrow in version

                 # Mock successful pyarrow import
                 with patch.dict("sys.modules", {"pyarrow": MagicMock()}):
                     result = CSVReader._select_optimal_engine()
                     assert result == "c"

     # Test 4: PyArrow import fails
     with patch("superset.commands.database.uploaders.csv_reader.util") as mock_util:
         with patch("superset.commands.database.uploaders.csv_reader.logger"):
             mock_util.find_spec = MagicMock(return_value=MagicMock())  # PyArrow found

             # Mock import error
             with patch(
                 "builtins.__import__", side_effect=ImportError("PyArrow import failed")
             ):
                 result = CSVReader._select_optimal_engine()
                 assert result == "c"


 def test_csv_reader_progressive_encoding_detection():
     """Test that progressive encoding detection uses multiple sample sizes."""
     import io

     from werkzeug.datastructures import FileStorage

     # Create a file with latin-1 encoding that will require detection
     content = "col1,col2,col3\n" + "café,résumé,naïve\n"
     binary_data = content.encode("latin-1")

     file = FileStorage(io.BytesIO(binary_data))

     # Track read calls to verify progressive sampling
     original_read = file.read
     read_calls = []
     read_sizes = []

     def track_read(size):
         read_calls.append(size)
         read_sizes.append(size)
         file.seek(0)  # Reset position for consistent reading
         result = original_read(size)
         file.seek(0)  # Reset again
         return result

     file.read = track_read

     # Call encoding detection
     detected_encoding = CSVReader._detect_encoding(file)

     # Should detect the correct encoding
     assert detected_encoding in [
         "latin-1",
         "utf-8",
     ], f"Should detect valid encoding, got {detected_encoding}"

     # Should have made multiple read attempts with different sizes
     # (The method tries multiple sample sizes until it finds a working encoding)
     assert len(read_calls) >= 1, f"Should have made read calls, got {read_calls}"

     # Test that the method handles the sample sizes properly
     assert all(size > 0 for size in read_sizes), "All sample sizes should be positive"