tests/unit_tests/commands/databases/csv_reader_test.py - superset - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import io
 from datetime import datetime

 import numpy as np
 import pytest
 from werkzeug.datastructures import FileStorage

 from superset.commands.database.exceptions import DatabaseUploadFailed
 from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
 from tests.unit_tests.fixtures.common import create_csv_file

 CSV_DATA = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "30", "city1", "1990-02-01"],
     ["name2", "25", "city2", "1995-02-01"],
     ["name3", "20", "city3", "2000-02-01"],
 ]

 CSV_DATA_CHANGED_HEADER = [
     ["name1", "30", "city1", "1990-02-01"],
     ["Name", "Age", "City", "Birth"],
     ["name2", "25", "city2", "1995-02-01"],
     ["name3", "20", "city3", "2000-02-01"],
 ]

 CSV_DATA_WITH_NULLS = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "N/A", "city1", "1990-02-01"],
     ["name2", "25", "None", "1995-02-01"],
     ["name3", "20", "city3", "2000-02-01"],
 ]

 CSV_DATA_DAY_FIRST = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "30", "city1", "01-02-1990"],
 ]

 CSV_DATA_DECIMAL_CHAR = [
     ["Name", "Age", "City", "Birth"],
     ["name1", "30,1", "city1", "1990-02-01"],
 ]

 CSV_DATA_SKIP_INITIAL_SPACE = [
     ["         Name", "Age", "City", "Birth"],
     ["      name1", "30", "city1", "1990-02-01"],
 ]


 @pytest.mark.parametrize(
     "file, options, expected_cols, expected_values",
     [
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA, delimiter="|"),
             CSVReaderOptions(delimiter="|"),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=["Name", "Age"],
             ),
             ["Name", "Age"],
             [
                 ["name1", 30],
                 ["name2", 25],
                 ["name3", 20],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=["Name", "Age"],
                 column_data_types={"Age": "float"},
             ),
             ["Name", "Age"],
             [
                 ["name1", 30.0],
                 ["name2", 25.0],
                 ["name3", 20.0],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=[],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 columns_read=[],
                 column_data_types={"Age": "float"},
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30.0, "city1", "1990-02-01"],
                 ["name2", 25.0, "city2", "1995-02-01"],
                 ["name3", 20.0, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 rows_to_read=1,
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30.0, "city1", "1990-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 rows_to_read=1,
                 columns_read=["Name", "Age"],
             ),
             ["Name", "Age"],
             [
                 ["name1", 30.0],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 skip_rows=1,
             ),
             ["name1", "30", "city1", "1990-02-01"],
             [
                 ["name2", 25.0, "city2", "1995-02-01"],
                 ["name3", 20.0, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA),
             CSVReaderOptions(
                 column_dates=["Birth"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
                 ["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
                 ["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
             ],
         ),
         (
             create_csv_file(CSV_DATA_CHANGED_HEADER),
             CSVReaderOptions(
                 header_row=1,
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name2", 25, "city2", "1995-02-01"],
                 ["name3", 20, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA_WITH_NULLS),
             CSVReaderOptions(
                 null_values=["N/A", "None"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", np.nan, "city1", "1990-02-01"],
                 ["name2", 25.0, np.nan, "1995-02-01"],
                 ["name3", 20.0, "city3", "2000-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA_DAY_FIRST),
             CSVReaderOptions(
                 day_first=False,
                 column_dates=["Birth"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", datetime(1990, 1, 2, 0, 0)],
             ],
         ),
         (
             create_csv_file(CSV_DATA_DAY_FIRST),
             CSVReaderOptions(
                 day_first=True,
                 column_dates=["Birth"],
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
             ],
         ),
         (
             create_csv_file(CSV_DATA_DECIMAL_CHAR),
             CSVReaderOptions(
                 decimal_character=",",
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30.1, "city1", "1990-02-01"],
             ],
         ),
         (
             create_csv_file(CSV_DATA_SKIP_INITIAL_SPACE),
             CSVReaderOptions(
                 skip_initial_space=True,
             ),
             ["Name", "Age", "City", "Birth"],
             [
                 ["name1", 30, "city1", "1990-02-01"],
             ],
         ),
     ],
 )
 def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_values):
     csv_reader = CSVReader(
         options=options,
     )
     df = csv_reader.file_to_dataframe(file)
     assert df.columns.tolist() == expected_cols
     actual_values = df.values.tolist()
     for i in range(len(expected_values)):
         for j in range(len(expected_values[i])):
             expected_val = expected_values[i][j]
             actual_val = actual_values[i][j]

             # Check if both values are NaN
             if isinstance(expected_val, float) and isinstance(actual_val, float):
                 assert np.isnan(expected_val) == np.isnan(actual_val)
             else:
                 assert expected_val == actual_val
     file.close()


 def test_csv_reader_index_column():
     csv_reader = CSVReader(
         options=CSVReaderOptions(index_column="Name"),
     )
     df = csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
     assert df.index.name == "Name"


 def test_csv_reader_wrong_index_column():
     csv_reader = CSVReader(
         options=CSVReaderOptions(index_column="wrong"),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
     assert str(ex.value) == "Parsing error: Index wrong invalid"


 def test_csv_reader_broken_file_no_columns():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file([""]))
     assert str(ex.value) == "Parsing error: No columns to parse from file"


 def test_csv_reader_wrong_columns_to_read():
     csv_reader = CSVReader(
         options=CSVReaderOptions(columns_read=["xpto"]),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
     assert str(ex.value) == (
         "Parsing error: Usecols do not match columns, "
         "columns expected but not found: ['xpto']"
     )


 def test_csv_reader_invalid_file():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(
             FileStorage(
                 io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"), filename=""
             )
         )
     assert str(ex.value) == (
         "Parsing error: Error tokenizing data. C error:"
         " Expected 3 fields in line 3, saw 7\n"
     )


 def test_csv_reader_invalid_encoding():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     binary_data = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
     assert str(ex.value) == (
         "Parsing error: 'utf-8' codec can't decode byte 0xba in"
         " position 21: invalid start byte"
     )


 def test_csv_reader_file_metadata():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     file = create_csv_file(CSV_DATA)
     metadata = csv_reader.file_metadata(file)
     assert metadata == {
         "items": [
             {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
         ]
     }
     file.close()

     file = create_csv_file(CSV_DATA, delimiter="|")
     csv_reader = CSVReader(
         options=CSVReaderOptions(delimiter="|"),
     )
     metadata = csv_reader.file_metadata(file)
     assert metadata == {
         "items": [
             {"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
         ]
     }
     file.close()


 def test_csv_reader_file_metadata_invalid_file():
     csv_reader = CSVReader(
         options=CSVReaderOptions(),
     )
     with pytest.raises(DatabaseUploadFailed) as ex:
         csv_reader.file_metadata(
             FileStorage(io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"))
         )
     assert str(ex.value) == (
         "Parsing error: Error tokenizing data. C error:"
         " Expected 3 fields in line 3, saw 7\n"
     )
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	import io
	from datetime import datetime

	import numpy as np
	import pytest
	from werkzeug.datastructures import FileStorage

	from superset.commands.database.exceptions import DatabaseUploadFailed
	from superset.commands.database.uploaders.csv_reader import CSVReader, CSVReaderOptions
	from tests.unit_tests.fixtures.common import create_csv_file

	CSV_DATA = [
	["Name", "Age", "City", "Birth"],
	["name1", "30", "city1", "1990-02-01"],
	["name2", "25", "city2", "1995-02-01"],
	["name3", "20", "city3", "2000-02-01"],
	]

	CSV_DATA_CHANGED_HEADER = [
	["name1", "30", "city1", "1990-02-01"],
	["Name", "Age", "City", "Birth"],
	["name2", "25", "city2", "1995-02-01"],
	["name3", "20", "city3", "2000-02-01"],
	]

	CSV_DATA_WITH_NULLS = [
	["Name", "Age", "City", "Birth"],
	["name1", "N/A", "city1", "1990-02-01"],
	["name2", "25", "None", "1995-02-01"],
	["name3", "20", "city3", "2000-02-01"],
	]

	CSV_DATA_DAY_FIRST = [
	["Name", "Age", "City", "Birth"],
	["name1", "30", "city1", "01-02-1990"],
	]

	CSV_DATA_DECIMAL_CHAR = [
	["Name", "Age", "City", "Birth"],
	["name1", "30,1", "city1", "1990-02-01"],
	]

	CSV_DATA_SKIP_INITIAL_SPACE = [
	[" Name", "Age", "City", "Birth"],
	[" name1", "30", "city1", "1990-02-01"],
	]


	@pytest.mark.parametrize(
	"file, options, expected_cols, expected_values",
	[
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30, "city1", "1990-02-01"],
	["name2", 25, "city2", "1995-02-01"],
	["name3", 20, "city3", "2000-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA, delimiter="\|"),
	CSVReaderOptions(delimiter="\|"),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30, "city1", "1990-02-01"],
	["name2", 25, "city2", "1995-02-01"],
	["name3", 20, "city3", "2000-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	columns_read=["Name", "Age"],
	),
	["Name", "Age"],
	[
	["name1", 30],
	["name2", 25],
	["name3", 20],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	columns_read=["Name", "Age"],
	column_data_types={"Age": "float"},
	),
	["Name", "Age"],
	[
	["name1", 30.0],
	["name2", 25.0],
	["name3", 20.0],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	columns_read=[],
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30, "city1", "1990-02-01"],
	["name2", 25, "city2", "1995-02-01"],
	["name3", 20, "city3", "2000-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	columns_read=[],
	column_data_types={"Age": "float"},
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30.0, "city1", "1990-02-01"],
	["name2", 25.0, "city2", "1995-02-01"],
	["name3", 20.0, "city3", "2000-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	rows_to_read=1,
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30.0, "city1", "1990-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	rows_to_read=1,
	columns_read=["Name", "Age"],
	),
	["Name", "Age"],
	[
	["name1", 30.0],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	skip_rows=1,
	),
	["name1", "30", "city1", "1990-02-01"],
	[
	["name2", 25.0, "city2", "1995-02-01"],
	["name3", 20.0, "city3", "2000-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA),
	CSVReaderOptions(
	column_dates=["Birth"],
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
	["name2", 25, "city2", datetime(1995, 2, 1, 0, 0)],
	["name3", 20, "city3", datetime(2000, 2, 1, 0, 0)],
	],
	),
	(
	create_csv_file(CSV_DATA_CHANGED_HEADER),
	CSVReaderOptions(
	header_row=1,
	),
	["Name", "Age", "City", "Birth"],
	[
	["name2", 25, "city2", "1995-02-01"],
	["name3", 20, "city3", "2000-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA_WITH_NULLS),
	CSVReaderOptions(
	null_values=["N/A", "None"],
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", np.nan, "city1", "1990-02-01"],
	["name2", 25.0, np.nan, "1995-02-01"],
	["name3", 20.0, "city3", "2000-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA_DAY_FIRST),
	CSVReaderOptions(
	day_first=False,
	column_dates=["Birth"],
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30, "city1", datetime(1990, 1, 2, 0, 0)],
	],
	),
	(
	create_csv_file(CSV_DATA_DAY_FIRST),
	CSVReaderOptions(
	day_first=True,
	column_dates=["Birth"],
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30, "city1", datetime(1990, 2, 1, 0, 0)],
	],
	),
	(
	create_csv_file(CSV_DATA_DECIMAL_CHAR),
	CSVReaderOptions(
	decimal_character=",",
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30.1, "city1", "1990-02-01"],
	],
	),
	(
	create_csv_file(CSV_DATA_SKIP_INITIAL_SPACE),
	CSVReaderOptions(
	skip_initial_space=True,
	),
	["Name", "Age", "City", "Birth"],
	[
	["name1", 30, "city1", "1990-02-01"],
	],
	),
	],
	)
	def test_csv_reader_file_to_dataframe(file, options, expected_cols, expected_values):
	csv_reader = CSVReader(
	options=options,
	)
	df = csv_reader.file_to_dataframe(file)
	assert df.columns.tolist() == expected_cols
	actual_values = df.values.tolist()
	for i in range(len(expected_values)):
	for j in range(len(expected_values[i])):
	expected_val = expected_values[i][j]
	actual_val = actual_values[i][j]

	# Check if both values are NaN
	if isinstance(expected_val, float) and isinstance(actual_val, float):
	assert np.isnan(expected_val) == np.isnan(actual_val)
	else:
	assert expected_val == actual_val
	file.close()


	def test_csv_reader_index_column():
	csv_reader = CSVReader(
	options=CSVReaderOptions(index_column="Name"),
	)
	df = csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
	assert df.index.name == "Name"


	def test_csv_reader_wrong_index_column():
	csv_reader = CSVReader(
	options=CSVReaderOptions(index_column="wrong"),
	)
	with pytest.raises(DatabaseUploadFailed) as ex:
	csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
	assert str(ex.value) == "Parsing error: Index wrong invalid"


	def test_csv_reader_broken_file_no_columns():
	csv_reader = CSVReader(
	options=CSVReaderOptions(),
	)
	with pytest.raises(DatabaseUploadFailed) as ex:
	csv_reader.file_to_dataframe(create_csv_file([""]))
	assert str(ex.value) == "Parsing error: No columns to parse from file"


	def test_csv_reader_wrong_columns_to_read():
	csv_reader = CSVReader(
	options=CSVReaderOptions(columns_read=["xpto"]),
	)
	with pytest.raises(DatabaseUploadFailed) as ex:
	csv_reader.file_to_dataframe(create_csv_file(CSV_DATA))
	assert str(ex.value) == (
	"Parsing error: Usecols do not match columns, "
	"columns expected but not found: ['xpto']"
	)


	def test_csv_reader_invalid_file():
	csv_reader = CSVReader(
	options=CSVReaderOptions(),
	)
	with pytest.raises(DatabaseUploadFailed) as ex:
	csv_reader.file_to_dataframe(
	FileStorage(
	io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"), filename=""
	)
	)
	assert str(ex.value) == (
	"Parsing error: Error tokenizing data. C error:"
	" Expected 3 fields in line 3, saw 7\n"
	)


	def test_csv_reader_invalid_encoding():
	csv_reader = CSVReader(
	options=CSVReaderOptions(),
	)
	binary_data = b"col1,col2,col3\nv1,v2,\xba\nv3,v4,v5\n"
	with pytest.raises(DatabaseUploadFailed) as ex:
	csv_reader.file_to_dataframe(FileStorage(io.BytesIO(binary_data)))
	assert str(ex.value) == (
	"Parsing error: 'utf-8' codec can't decode byte 0xba in"
	" position 21: invalid start byte"
	)


	def test_csv_reader_file_metadata():
	csv_reader = CSVReader(
	options=CSVReaderOptions(),
	)
	file = create_csv_file(CSV_DATA)
	metadata = csv_reader.file_metadata(file)
	assert metadata == {
	"items": [
	{"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
	]
	}
	file.close()

	file = create_csv_file(CSV_DATA, delimiter="\|")
	csv_reader = CSVReader(
	options=CSVReaderOptions(delimiter="\|"),
	)
	metadata = csv_reader.file_metadata(file)
	assert metadata == {
	"items": [
	{"column_names": ["Name", "Age", "City", "Birth"], "sheet_name": None}
	]
	}
	file.close()


	def test_csv_reader_file_metadata_invalid_file():
	csv_reader = CSVReader(
	options=CSVReaderOptions(),
	)
	with pytest.raises(DatabaseUploadFailed) as ex:
	csv_reader.file_metadata(
	FileStorage(io.StringIO("c1,c2,c3\na,b,c\n1,2,3,4,5,6,7\n1,2,3"))
	)
	assert str(ex.value) == (
	"Parsing error: Error tokenizing data. C error:"
	" Expected 3 fields in line 3, saw 7\n"
	)