blob: e1f10516683773fa6f04bc6891d43c500de94e2f [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
import numpy as np
import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
class FrameRenameMixin:
def test_rename_dataframe(self):
pdf1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
psdf1 = ps.from_pandas(pdf1)
self.assert_eq(
psdf1.rename(columns={"A": "a", "B": "b"}), pdf1.rename(columns={"A": "a", "B": "b"})
)
result_psdf = psdf1.rename(index={1: 10, 2: 20})
result_pdf = pdf1.rename(index={1: 10, 2: 20})
self.assert_eq(result_psdf, result_pdf)
# inplace
pser = result_pdf.A
psser = result_psdf.A
result_psdf.rename(index={10: 100, 20: 200}, inplace=True)
result_pdf.rename(index={10: 100, 20: 200}, inplace=True)
self.assert_eq(result_psdf, result_pdf)
self.assert_eq(psser, pser)
def str_lower(s) -> str:
return str.lower(s)
self.assert_eq(
psdf1.rename(str_lower, axis="columns"), pdf1.rename(str_lower, axis="columns")
)
def mul10(x) -> int:
return x * 10
self.assert_eq(psdf1.rename(mul10, axis="index"), pdf1.rename(mul10, axis="index"))
self.assert_eq(
psdf1.rename(columns=str_lower, index={1: 10, 2: 20}),
pdf1.rename(columns=str_lower, index={1: 10, 2: 20}),
)
self.assert_eq(
psdf1.rename(columns=lambda x: str.lower(x)),
pdf1.rename(columns=lambda x: str.lower(x)),
)
idx = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C"), ("Y", "D")])
pdf2 = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=idx)
psdf2 = ps.from_pandas(pdf2)
self.assert_eq(psdf2.rename(columns=str_lower), pdf2.rename(columns=str_lower))
self.assert_eq(
psdf2.rename(columns=lambda x: str.lower(x)),
pdf2.rename(columns=lambda x: str.lower(x)),
)
self.assert_eq(
psdf2.rename(columns=str_lower, level=0), pdf2.rename(columns=str_lower, level=0)
)
self.assert_eq(
psdf2.rename(columns=str_lower, level=1), pdf2.rename(columns=str_lower, level=1)
)
pdf3 = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=idx, columns=list("ab"))
psdf3 = ps.from_pandas(pdf3)
self.assert_eq(psdf3.rename(index=str_lower), pdf3.rename(index=str_lower))
self.assert_eq(
psdf3.rename(index=str_lower, level=0), pdf3.rename(index=str_lower, level=0)
)
self.assert_eq(
psdf3.rename(index=str_lower, level=1), pdf3.rename(index=str_lower, level=1)
)
pdf4 = pdf2 + 1
psdf4 = psdf2 + 1
self.assert_eq(psdf4.rename(columns=str_lower), pdf4.rename(columns=str_lower))
pdf5 = pdf3 + 1
psdf5 = psdf3 + 1
self.assert_eq(psdf5.rename(index=str_lower), pdf5.rename(index=str_lower))
msg = "Either `index` or `columns` should be provided."
with self.assertRaisesRegex(ValueError, msg):
psdf1.rename()
msg = "`mapper` or `index` or `columns` should be either dict-like or function type."
with self.assertRaisesRegex(ValueError, msg):
psdf1.rename(mapper=[str_lower], axis=1)
msg = "Mapper dict should have the same value type."
with self.assertRaisesRegex(ValueError, msg):
psdf1.rename({"A": "a", "B": 2}, axis=1)
msg = r"level should be an integer between \[0, column_labels_level\)"
with self.assertRaisesRegex(ValueError, msg):
psdf2.rename(columns=str_lower, level=2)
msg = r"level should be an integer between \[0, 2\)"
with self.assertRaisesRegex(ValueError, msg):
psdf3.rename(index=str_lower, level=2)
def test_rename_axis(self):
index = pd.Index(["A", "B", "C"], name="index")
columns = pd.Index(["numbers", "values"], name="cols")
pdf = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], index=index, columns=columns)
psdf = ps.from_pandas(pdf)
for axis in [0, "index"]:
self.assert_eq(
pdf.rename_axis("index2", axis=axis).sort_index(),
psdf.rename_axis("index2", axis=axis).sort_index(),
)
self.assert_eq(
pdf.rename_axis(["index2"], axis=axis).sort_index(),
psdf.rename_axis(["index2"], axis=axis).sort_index(),
)
for axis in [1, "columns"]:
self.assert_eq(
pdf.rename_axis("cols2", axis=axis).sort_index(),
psdf.rename_axis("cols2", axis=axis).sort_index(),
)
self.assert_eq(
pdf.rename_axis(["cols2"], axis=axis).sort_index(),
psdf.rename_axis(["cols2"], axis=axis).sort_index(),
)
pdf2 = pdf.copy()
psdf2 = psdf.copy()
pdf2.rename_axis("index2", axis="index", inplace=True)
psdf2.rename_axis("index2", axis="index", inplace=True)
self.assert_eq(pdf2.sort_index(), psdf2.sort_index())
self.assertRaises(ValueError, lambda: psdf.rename_axis(["index2", "index3"], axis=0))
self.assertRaises(ValueError, lambda: psdf.rename_axis(["cols2", "cols3"], axis=1))
self.assertRaises(TypeError, lambda: psdf.rename_axis(mapper=["index2"], index=["index3"]))
self.assertRaises(ValueError, lambda: psdf.rename_axis(ps))
self.assert_eq(
pdf.rename_axis(index={"index": "index2"}, columns={"cols": "cols2"}).sort_index(),
psdf.rename_axis(index={"index": "index2"}, columns={"cols": "cols2"}).sort_index(),
)
self.assert_eq(
pdf.rename_axis(index={"missing": "index2"}, columns={"missing": "cols2"}).sort_index(),
psdf.rename_axis(
index={"missing": "index2"}, columns={"missing": "cols2"}
).sort_index(),
)
self.assert_eq(
pdf.rename_axis(index=str.upper, columns=str.upper).sort_index(),
psdf.rename_axis(index=str.upper, columns=str.upper).sort_index(),
)
index = pd.MultiIndex.from_tuples(
[("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
)
columns = pd.MultiIndex.from_tuples(
[("numbers", "first"), ("values", "second")], names=["cols1", "cols2"]
)
pdf = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], index=index, columns=columns)
psdf = ps.from_pandas(pdf)
for axis in [0, "index"]:
self.assert_eq(
pdf.rename_axis(["index3", "index4"], axis=axis).sort_index(),
psdf.rename_axis(["index3", "index4"], axis=axis).sort_index(),
)
for axis in [1, "columns"]:
self.assert_eq(
pdf.rename_axis(["cols3", "cols4"], axis=axis).sort_index(),
psdf.rename_axis(["cols3", "cols4"], axis=axis).sort_index(),
)
self.assertRaises(
ValueError, lambda: psdf.rename_axis(["index3", "index4", "index5"], axis=0)
)
self.assertRaises(ValueError, lambda: psdf.rename_axis(["cols3", "cols4", "cols5"], axis=1))
self.assert_eq(
pdf.rename_axis(index={"index1": "index3"}, columns={"cols1": "cols3"}).sort_index(),
psdf.rename_axis(index={"index1": "index3"}, columns={"cols1": "cols3"}).sort_index(),
)
self.assert_eq(
pdf.rename_axis(index={"missing": "index3"}, columns={"missing": "cols3"}).sort_index(),
psdf.rename_axis(
index={"missing": "index3"}, columns={"missing": "cols3"}
).sort_index(),
)
self.assert_eq(
pdf.rename_axis(
index={"index1": "index3", "index2": "index4"},
columns={"cols1": "cols3", "cols2": "cols4"},
).sort_index(),
psdf.rename_axis(
index={"index1": "index3", "index2": "index4"},
columns={"cols1": "cols3", "cols2": "cols4"},
).sort_index(),
)
self.assert_eq(
pdf.rename_axis(index=str.upper, columns=str.upper).sort_index(),
psdf.rename_axis(index=str.upper, columns=str.upper).sort_index(),
)
def test_multi_index_rename(self):
arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
pdf = pd.DataFrame(np.random.randn(4, 5), idx)
psdf = ps.from_pandas(pdf)
pmidx = pdf.index
psmidx = psdf.index
self.assert_eq(psmidx.rename(["n", "c"]), pmidx.rename(["n", "c"]))
self.assert_eq(psdf.index.names, pdf.index.names)
# non-string names
self.assert_eq(psmidx.rename([0, 1]), pmidx.rename([0, 1]))
self.assert_eq(
psmidx.rename([("x", "a"), ("y", "b")]), pmidx.rename([("x", "a"), ("y", "b")])
)
psmidx.rename(["num", "col"], inplace=True)
pmidx.rename(["num", "col"], inplace=True)
self.assert_eq(psmidx, pmidx)
self.assert_eq(psdf.index.names, pdf.index.names)
self.assert_eq(psmidx.rename([None, None]), pmidx.rename([None, None]))
self.assert_eq(psdf.index.names, pdf.index.names)
self.assertRaises(TypeError, lambda: psmidx.rename("number"))
self.assertRaises(TypeError, lambda: psmidx.rename(None))
self.assertRaises(ValueError, lambda: psmidx.rename(["number"]))
def test_multiindex_rename(self):
pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
psidx = ps.from_pandas(pidx)
pidx = pidx.rename(list("ABC"))
psidx = psidx.rename(list("ABC"))
self.assert_eq(pidx, psidx)
pidx = pidx.rename(["my", "name", "is"])
psidx = psidx.rename(["my", "name", "is"])
self.assert_eq(pidx, psidx)
def test_index_rename(self):
pdf = pd.DataFrame(
np.random.randn(10, 5), index=pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], name="x")
)
psdf = ps.from_pandas(pdf)
pidx = pdf.index
psidx = psdf.index
self.assert_eq(psidx.rename("y"), pidx.rename("y"))
self.assert_eq(psdf.index.names, pdf.index.names)
# non-string names
self.assert_eq(psidx.rename(0), pidx.rename(0))
self.assert_eq(psidx.rename(("y", 0)), pidx.rename(("y", 0)))
psidx.rename("z", inplace=True)
pidx.rename("z", inplace=True)
self.assert_eq(psidx, pidx)
self.assert_eq(psdf.index.names, pdf.index.names)
self.assert_eq(psidx.rename(None), pidx.rename(None))
self.assert_eq(psdf.index.names, pdf.index.names)
self.assertRaises(TypeError, lambda: psidx.rename(["x", "y"]))
class FrameRenameTests(
FrameRenameMixin,
PandasOnSparkTestCase,
SQLTestUtils,
):
pass
if __name__ == "__main__":
from pyspark.pandas.tests.indexes.test_rename import * # noqa: F401
try:
import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
testRunner = None
unittest.main(testRunner=testRunner, verbosity=2)