tests/unit_tests/pandas_postprocessing/test_flatten.py - superset - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import pandas as pd

 from superset.utils import pandas_postprocessing as pp
 from superset.utils.pandas_postprocessing.utils import FLAT_COLUMN_SEPARATOR
 from tests.unit_tests.fixtures.dataframes import timeseries_df


 def test_flat_should_not_change():
     """Flattening a frame with flat columns and a default index returns an equal frame."""
     source = pd.DataFrame(data={"foo": [1, 2, 3], "bar": [4, 5, 6]})

     flattened = pp.flatten(source)

     assert flattened.equals(source)


 def test_flat_should_not_reset_index():
     """``reset_index=False`` returns a frame equal to the indexed input."""
     timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
         "__timestamp"
     )
     source = pd.DataFrame(data={"foo": [1, 2, 3], "bar": [4, 5, 6]}, index=timestamps)

     untouched = pp.flatten(source, reset_index=False)

     assert untouched.equals(source)


 def test_flat_should_flat_datetime_index():
     """By default the named datetime index becomes the first regular column."""
     dates = ["2021-01-01", "2021-01-02", "2021-01-03"]
     timestamps = pd.to_datetime(dates).rename("__timestamp")
     source = pd.DataFrame(data={"foo": [1, 2, 3], "bar": [4, 5, 6]}, index=timestamps)

     flattened = pp.flatten(source)

     # The index values are expected as an ordinary "__timestamp" column.
     expected = pd.DataFrame(
         {"__timestamp": timestamps, "foo": [1, 2, 3], "bar": [4, 5, 6]}
     )
     assert flattened.equals(expected)


 def test_flat_should_flat_multiple_index():
     """Two column levels are joined into one name with FLAT_COLUMN_SEPARATOR."""
     timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
         "__timestamp"
     )
     level_values = [["foo", "bar"], [1, "two"]]
     two_levels = pd.MultiIndex.from_product(level_values, names=["level1", "level2"])
     source = pd.DataFrame(index=timestamps, columns=two_levels, data=1)

     flattened = pp.flatten(source)

     # The integer label 1 is expected as the string "1" in the joined name.
     expected_data = {"__timestamp": timestamps}
     for parts in (["foo", "1"], ["foo", "two"], ["bar", "1"], ["bar", "two"]):
         expected_data[FLAT_COLUMN_SEPARATOR.join(parts)] = [1, 1, 1]
     assert flattened.equals(pd.DataFrame(expected_data))


 def test_flat_should_drop_index_level():
     """``drop_levels`` strips column levels, given by position or by name."""
     timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
         "__timestamp"
     )
     four_levels = pd.MultiIndex.from_arrays(
         [["a"] * 3, ["b"] * 3, ["c", "d", "e"], ["ff", "ii", "gg"]],
         names=["level1", "level2", "level3", "level4"],
     )
     source = pd.DataFrame(index=timestamps, columns=four_levels, data=1)

     def build_expected(remaining_parts):
         # One all-ones column per surviving label combination, after the timestamp.
         frame_data = {"__timestamp": timestamps}
         for parts in remaining_parts:
             frame_data[FLAT_COLUMN_SEPARATOR.join(parts)] = [1, 1, 1]
         return pd.DataFrame(frame_data)

     last_two_levels = (["c", "ff"], ["d", "ii"], ["e", "gg"])
     last_level_only = (["ff"], ["ii"], ["gg"])

     # Every call below receives its own copy of the source frame.

     # drop level by index
     by_position = pp.flatten(source.copy(), drop_levels=(0, 1))
     assert by_position.equals(build_expected(last_two_levels))

     # drop level by name
     by_name = pp.flatten(source.copy(), drop_levels=("level1", "level2"))
     assert by_name.equals(build_expected(last_two_levels))

     # only leave 1 level
     single_level = pp.flatten(source.copy(), drop_levels=(0, 1, 2))
     assert single_level.equals(build_expected(last_level_only))


 def test_flat_should_not_droplevel():
     """Flattening ``timeseries_df`` with ``drop_levels=(0,)`` gives the frame below."""
     flattened = pp.flatten(timeseries_df, drop_levels=(0,))

     # NOTE(review): the expectation implies the fixture has an unnamed datetime
     # index plus "label" and "y" columns -- confirm against the fixture module.
     dates = pd.to_datetime(["2019-01-01", "2019-01-02", "2019-01-05", "2019-01-07"])
     expected = pd.DataFrame(
         {"index": dates, "label": ["x", "y", "z", "q"], "y": [1.0, 2.0, 3.0, 4.0]}
     )
     assert flattened.equals(expected)


 def test_flat_integer_column_name():
     """Integer labels left after dropping a level are expected as string names."""
     dates = ["2021-01-01", "2021-01-02", "2021-01-03"]
     timestamps = pd.to_datetime(dates).rename("__timestamp")
     labelled = pd.MultiIndex.from_arrays(
         [["a"] * 3, [100, 200, 300]],
         names=["level1", "level2"],
     )
     source = pd.DataFrame(index=timestamps, columns=labelled, data=1)

     flattened = pp.flatten(source, drop_levels=(0,))

     # The expected timestamp column is built from a fresh, unnamed datetime index.
     expected = pd.DataFrame(
         {
             "__timestamp": pd.to_datetime(dates),
             "100": [1, 1, 1],
             "200": [1, 1, 1],
             "300": [1, 1, 1],
         }
     )
     assert flattened.equals(expected)


 def test_escape_column_name():
     """Commas inside column level values are backslash-escaped in the flat names."""
     timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
         "__timestamp"
     )
     comma_levels = pd.MultiIndex.from_arrays(
         [
             ["level1,value1", "level1,value2", "level1,value3"],
             ["level2, value1", "level2, value2", "level2, value3"],
         ],
         names=["level1", "level2"],
     )
     source = pd.DataFrame(index=timestamps, columns=comma_levels, data=1)

     flat_names = list(pp.flatten(source).columns.values)

     # Each pair is (escaped level1 value, escaped level2 value).
     escaped_pairs = [
         ("level1\\,value1", "level2\\, value1"),
         ("level1\\,value2", "level2\\, value2"),
         ("level1\\,value3", "level2\\, value3"),
     ]
     expected_names = ["__timestamp"]
     expected_names.extend(FLAT_COLUMN_SEPARATOR.join(pair) for pair in escaped_pairs)
     assert flat_names == expected_names
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	import pandas as pd

	from superset.utils import pandas_postprocessing as pp
	from superset.utils.pandas_postprocessing.utils import FLAT_COLUMN_SEPARATOR
	from tests.unit_tests.fixtures.dataframes import timeseries_df


	def test_flat_should_not_change():
	    """Flattening a frame with flat columns and a default index returns an equal frame."""
	    source = pd.DataFrame(data={"foo": [1, 2, 3], "bar": [4, 5, 6]})

	    flattened = pp.flatten(source)

	    assert flattened.equals(source)


	def test_flat_should_not_reset_index():
	    """``reset_index=False`` returns a frame equal to the indexed input."""
	    timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
	        "__timestamp"
	    )
	    source = pd.DataFrame(data={"foo": [1, 2, 3], "bar": [4, 5, 6]}, index=timestamps)

	    untouched = pp.flatten(source, reset_index=False)

	    assert untouched.equals(source)


	def test_flat_should_flat_datetime_index():
	    """By default the named datetime index becomes the first regular column."""
	    dates = ["2021-01-01", "2021-01-02", "2021-01-03"]
	    timestamps = pd.to_datetime(dates).rename("__timestamp")
	    source = pd.DataFrame(data={"foo": [1, 2, 3], "bar": [4, 5, 6]}, index=timestamps)

	    flattened = pp.flatten(source)

	    # The index values are expected as an ordinary "__timestamp" column.
	    expected = pd.DataFrame(
	        {"__timestamp": timestamps, "foo": [1, 2, 3], "bar": [4, 5, 6]}
	    )
	    assert flattened.equals(expected)


	def test_flat_should_flat_multiple_index():
	    """Two column levels are joined into one name with FLAT_COLUMN_SEPARATOR."""
	    timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
	        "__timestamp"
	    )
	    level_values = [["foo", "bar"], [1, "two"]]
	    two_levels = pd.MultiIndex.from_product(level_values, names=["level1", "level2"])
	    source = pd.DataFrame(index=timestamps, columns=two_levels, data=1)

	    flattened = pp.flatten(source)

	    # The integer label 1 is expected as the string "1" in the joined name.
	    expected_data = {"__timestamp": timestamps}
	    for parts in (["foo", "1"], ["foo", "two"], ["bar", "1"], ["bar", "two"]):
	        expected_data[FLAT_COLUMN_SEPARATOR.join(parts)] = [1, 1, 1]
	    assert flattened.equals(pd.DataFrame(expected_data))


	def test_flat_should_drop_index_level():
	    """``drop_levels`` strips column levels, given by position or by name."""
	    timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
	        "__timestamp"
	    )
	    four_levels = pd.MultiIndex.from_arrays(
	        [["a"] * 3, ["b"] * 3, ["c", "d", "e"], ["ff", "ii", "gg"]],
	        names=["level1", "level2", "level3", "level4"],
	    )
	    source = pd.DataFrame(index=timestamps, columns=four_levels, data=1)

	    def build_expected(remaining_parts):
	        # One all-ones column per surviving label combination, after the timestamp.
	        frame_data = {"__timestamp": timestamps}
	        for parts in remaining_parts:
	            frame_data[FLAT_COLUMN_SEPARATOR.join(parts)] = [1, 1, 1]
	        return pd.DataFrame(frame_data)

	    last_two_levels = (["c", "ff"], ["d", "ii"], ["e", "gg"])
	    last_level_only = (["ff"], ["ii"], ["gg"])

	    # Every call below receives its own copy of the source frame.

	    # drop level by index
	    by_position = pp.flatten(source.copy(), drop_levels=(0, 1))
	    assert by_position.equals(build_expected(last_two_levels))

	    # drop level by name
	    by_name = pp.flatten(source.copy(), drop_levels=("level1", "level2"))
	    assert by_name.equals(build_expected(last_two_levels))

	    # only leave 1 level
	    single_level = pp.flatten(source.copy(), drop_levels=(0, 1, 2))
	    assert single_level.equals(build_expected(last_level_only))


	def test_flat_should_not_droplevel():
	    """Flattening ``timeseries_df`` with ``drop_levels=(0,)`` gives the frame below."""
	    flattened = pp.flatten(timeseries_df, drop_levels=(0,))

	    # NOTE(review): the expectation implies the fixture has an unnamed datetime
	    # index plus "label" and "y" columns -- confirm against the fixture module.
	    dates = pd.to_datetime(["2019-01-01", "2019-01-02", "2019-01-05", "2019-01-07"])
	    expected = pd.DataFrame(
	        {"index": dates, "label": ["x", "y", "z", "q"], "y": [1.0, 2.0, 3.0, 4.0]}
	    )
	    assert flattened.equals(expected)


	def test_flat_integer_column_name():
	    """Integer labels left after dropping a level are expected as string names."""
	    dates = ["2021-01-01", "2021-01-02", "2021-01-03"]
	    timestamps = pd.to_datetime(dates).rename("__timestamp")
	    labelled = pd.MultiIndex.from_arrays(
	        [["a"] * 3, [100, 200, 300]],
	        names=["level1", "level2"],
	    )
	    source = pd.DataFrame(index=timestamps, columns=labelled, data=1)

	    flattened = pp.flatten(source, drop_levels=(0,))

	    # The expected timestamp column is built from a fresh, unnamed datetime index.
	    expected = pd.DataFrame(
	        {
	            "__timestamp": pd.to_datetime(dates),
	            "100": [1, 1, 1],
	            "200": [1, 1, 1],
	            "300": [1, 1, 1],
	        }
	    )
	    assert flattened.equals(expected)


	def test_escape_column_name():
	    """Commas inside column level values are backslash-escaped in the flat names."""
	    timestamps = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]).rename(
	        "__timestamp"
	    )
	    comma_levels = pd.MultiIndex.from_arrays(
	        [
	            ["level1,value1", "level1,value2", "level1,value3"],
	            ["level2, value1", "level2, value2", "level2, value3"],
	        ],
	        names=["level1", "level2"],
	    )
	    source = pd.DataFrame(index=timestamps, columns=comma_levels, data=1)

	    flat_names = list(pp.flatten(source).columns.values)

	    # Each pair is (escaped level1 value, escaped level2 value).
	    escaped_pairs = [
	        ("level1\\,value1", "level2\\, value1"),
	        ("level1\\,value2", "level2\\, value2"),
	        ("level1\\,value3", "level2\\, value3"),
	    ]
	    expected_names = ["__timestamp"]
	    expected_names.extend(FLAT_COLUMN_SEPARATOR.join(pair) for pair in escaped_pairs)
	    assert flat_names == expected_names