# blob: dd88d9f47f6c334998fb023fec9f1d69b3388cae [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
class FrameReindexMixin:
    """Compare ``reindex`` / ``reindex_like`` on pandas and pandas-on-Spark frames.

    The mixin defines no assertion helpers of its own; it calls
    ``self.assert_eq`` and ``self.assertRaises``, so it has to be combined
    with a test case class that supplies them.
    """

    def test_reindex(self):
        """Run ``DataFrame.reindex`` with matching arguments on both libraries."""
        row_labels = pd.Index(["A", "B", "C", "D", "E"])
        col_labels = pd.Index(["numbers"])
        pdf = pd.DataFrame([1.0, 2.0, 3.0, 4.0, None], index=row_labels, columns=col_labels)
        psdf = ps.from_pandas(pdf)

        target_cols = pd.Index(["numbers", "2", "3"], name="cols2")
        expected = pdf.reindex(columns=target_cols).sort_index()
        actual = psdf.reindex(columns=target_cols).sort_index()
        self.assert_eq(expected, actual)

        # Give the column axis a name on both frames before continuing.
        col_labels = pd.Index(["numbers"], name="cols")
        pdf.columns = col_labels
        psdf.columns = col_labels

        expected = pdf.reindex(["A", "B", "C"], columns=["numbers", "2", "3"]).sort_index()
        actual = psdf.reindex(["A", "B", "C"], columns=["numbers", "2", "3"]).sort_index()
        self.assert_eq(expected, actual)

        # The expected frame is built by hand here because of a bug in pandas.
        handmade = ps.DataFrame([1.0, 2.0, 3.0], index=ps.Index(["A", "B", "C"]))
        handmade.columns = pd.Index(["numbers"], name="cols")
        actual = psdf.reindex(["A", "B", "C"], index=["numbers", "2", "3"]).sort_index()
        self.assert_eq(actual, handmade)

        expected = pdf.reindex(index=["A", "B"]).sort_index()
        actual = psdf.reindex(index=["A", "B"]).sort_index()
        self.assert_eq(expected, actual)

        expected = pdf.reindex(index=["A", "B", "2", "3"]).sort_index()
        actual = psdf.reindex(index=["A", "B", "2", "3"]).sort_index()
        self.assert_eq(expected, actual)

        expected = pdf.reindex(index=["A", "E", "2", "3"], fill_value=0).sort_index()
        actual = psdf.reindex(index=["A", "E", "2", "3"], fill_value=0).sort_index()
        self.assert_eq(expected, actual)

        expected = pdf.reindex(columns=["numbers"]).sort_index()
        actual = psdf.reindex(columns=["numbers"]).sort_index()
        self.assert_eq(expected, actual)

        expected = pdf.reindex(columns=["numbers"], copy=True).sort_index()
        actual = psdf.reindex(columns=["numbers"], copy=True).sort_index()
        self.assert_eq(expected, actual)

        # Using float as fill_value to avoid int64/32 clash
        expected = pdf.reindex(columns=["numbers", "2", "3"], fill_value=0.0).sort_index()
        actual = psdf.reindex(columns=["numbers", "2", "3"], fill_value=0.0).sort_index()
        self.assert_eq(expected, actual)

        target_cols = pd.Index(["numbers", "2", "3"])
        expected = pdf.reindex(columns=target_cols).sort_index()
        actual = psdf.reindex(columns=target_cols).sort_index()
        self.assert_eq(expected, actual)

        target_cols = pd.Index(["numbers", "2", "3"], name="cols2")
        expected = pdf.reindex(columns=target_cols).sort_index()
        actual = psdf.reindex(columns=target_cols).sort_index()
        self.assert_eq(expected, actual)

        # Reindexing single Index on single Index
        pd_target = pd.Index(["A", "C", "D", "E", "0"], name="index2")
        ps_target = ps.from_pandas(pd_target)
        for fill in (None, 0):
            expected = pdf.reindex(index=pd_target, fill_value=fill).sort_index()
            actual = psdf.reindex(index=ps_target, fill_value=fill).sort_index()
            self.assert_eq(expected, actual)

        # Same labels, but the target index is taken from a frame's index.
        holder = pd.DataFrame({"index2": ["A", "C", "D", "E", "0"]})
        pd_target = holder.set_index("index2").index
        ps_target = ps.from_pandas(pd_target)
        for fill in (None, 0):
            expected = pdf.reindex(index=pd_target, fill_value=fill).sort_index()
            actual = psdf.reindex(index=ps_target, fill_value=fill).sort_index()
            self.assert_eq(expected, actual)

        # Reindexing MultiIndex on single Index
        pd_multi = pd.MultiIndex.from_tuples(
            [("A", "B"), ("C", "D"), ("F", "G")], names=["name1", "name2"]
        )
        ps_multi = ps.from_pandas(pd_multi)
        expected = pdf.reindex(index=pd_multi, fill_value=0.0).sort_index()
        actual = psdf.reindex(index=ps_multi, fill_value=0.0).sort_index()
        self.assert_eq(expected, actual)

        # Specifying the `labels` parameter
        fresh_labels = ["V", "W", "X", "Y", "Z"]
        expected = pdf.reindex(labels=fresh_labels, fill_value=0.0, axis=0).sort_index()
        actual = psdf.reindex(labels=fresh_labels, fill_value=0.0, axis=0).sort_index()
        self.assert_eq(expected, actual)

        expected = pdf.reindex(labels=fresh_labels, fill_value=0.0, axis=1).sort_index()
        actual = psdf.reindex(labels=fresh_labels, fill_value=0.0, axis=1).sort_index()
        self.assert_eq(expected, actual)

        # Each of these keyword combinations must raise TypeError.
        rejected_kwargs = (
            {"columns": ["numbers", "2", "3"], "axis": 1},
            {"columns": ["numbers", "2", "3"], "axis": 2},
            {"columns": "numbers"},
            {"index": ["A", "B", "C"], "axis": 1},
            {"index": 123},
        )
        for kwargs in rejected_kwargs:
            with self.assertRaises(TypeError):
                psdf.reindex(**kwargs)

        # Reindexing MultiIndex on MultiIndex
        pdf = pd.DataFrame({"numbers": [1.0, 2.0, None]}, index=pd_multi)
        psdf = ps.from_pandas(pdf)
        pd_target = pd.MultiIndex.from_tuples(
            [("A", "G"), ("C", "D"), ("I", "J")], names=["name1", "name2"]
        )
        ps_target = ps.from_pandas(pd_target)
        for fill in (None, 0.0):
            expected = pdf.reindex(index=pd_target, fill_value=fill).sort_index()
            actual = psdf.reindex(index=ps_target, fill_value=fill).sort_index()
            self.assert_eq(expected, actual)

        # Same tuples, but the MultiIndex is taken from a frame's index.
        holder = pd.DataFrame(
            {"index_level_1": ["A", "C", "I"], "index_level_2": ["G", "D", "J"]}
        )
        pd_target = holder.set_index(["index_level_1", "index_level_2"]).index
        ps_target = ps.from_pandas(pd_target)
        for fill in (None, 0.0):
            expected = pdf.reindex(index=pd_target, fill_value=fill).sort_index()
            actual = psdf.reindex(index=ps_target, fill_value=fill).sort_index()
            self.assert_eq(expected, actual)

        multi_cols = pd.MultiIndex.from_tuples([("X", "numbers")], names=["cols1", "cols2"])
        pdf.columns = multi_cols
        psdf.columns = multi_cols

        # Reindexing MultiIndex index on MultiIndex columns and MultiIndex index
        for fill in (None, 0.0):
            expected = pdf.reindex(index=pd_target, fill_value=fill).sort_index()
            actual = psdf.reindex(index=ps_target, fill_value=fill).sort_index()
            self.assert_eq(expected, actual)

        row_labels = pd.Index(["A", "B", "C", "D", "E"])
        pdf = pd.DataFrame(
            data=[1.0, 2.0, 3.0, 4.0, None], index=row_labels, columns=multi_cols
        )
        psdf = ps.from_pandas(pdf)
        pd_target = pd.Index(["A", "C", "D", "E", "0"], name="index2")
        ps_target = ps.from_pandas(pd_target)

        # Reindexing single Index on MultiIndex columns and single Index
        for fill in (None, 0.0):
            expected = pdf.reindex(index=pd_target, fill_value=fill).sort_index()
            actual = psdf.reindex(index=ps_target, fill_value=fill).sort_index()
            self.assert_eq(expected, actual)

        for fill in (None, 0.0):
            expected = pdf.reindex(
                columns=[("X", "numbers"), ("Y", "2"), ("Y", "3")], fill_value=fill
            ).sort_index()
            actual = psdf.reindex(
                columns=[("X", "numbers"), ("Y", "2"), ("Y", "3")], fill_value=fill
            ).sort_index()
            self.assert_eq(expected, actual)

        target_cols = pd.MultiIndex.from_tuples(
            [("X", "numbers"), ("Y", "2"), ("Y", "3")], names=["cols3", "cols4"]
        )
        expected = pdf.reindex(columns=target_cols).sort_index()
        actual = psdf.reindex(columns=target_cols).sort_index()
        self.assert_eq(expected, actual)

        # Column labels that do not fit the two-level column index are rejected.
        with self.assertRaises(TypeError):
            psdf.reindex(columns=["X"])
        with self.assertRaises(ValueError):
            psdf.reindex(columns=[("X",)])

    def test_reindex_like(self):
        """Run ``DataFrame.reindex_like`` with matching templates on both libraries."""
        values = [[1.0, 2.0], [3.0, None], [None, 4.0]]
        row_labels = pd.Index(["A", "B", "C"], name="index")
        col_labels = pd.Index(["numbers", "values"], name="cols")
        pdf = pd.DataFrame(data=values, index=row_labels, columns=col_labels)
        psdf = ps.from_pandas(pdf)

        # Reindexing single Index on single Index
        other_values = [[5.0, None], [6.0, 7.0], [8.0, None]]
        other_rows = pd.Index(["A", "C", "D"], name="index2")
        other_cols = pd.Index(["numbers", "F"], name="cols2")
        pdf2 = pd.DataFrame(data=other_values, index=other_rows, columns=other_cols)
        psdf2 = ps.from_pandas(pdf2)

        expected = pdf.reindex_like(pdf2).sort_index()
        actual = psdf.reindex_like(psdf2).sort_index()
        self.assert_eq(expected, actual)

        # Template whose only column is moved into the index.
        pdf2 = pd.DataFrame({"index_level_1": ["A", "C", "I"]})
        psdf2 = ps.from_pandas(pdf2)

        expected = pdf.reindex_like(pdf2.set_index(["index_level_1"])).sort_index()
        actual = psdf.reindex_like(psdf2.set_index(["index_level_1"])).sort_index()
        self.assert_eq(expected, actual)

        # Reindexing MultiIndex on single Index
        other_rows = pd.MultiIndex.from_tuples(
            [("A", "G"), ("C", "D"), ("I", "J")], names=["name3", "name4"]
        )
        pdf2 = pd.DataFrame(data=other_values, index=other_rows)
        psdf2 = ps.from_pandas(pdf2)

        expected = pdf.reindex_like(pdf2).sort_index()
        actual = psdf.reindex_like(psdf2).sort_index()
        self.assert_eq(expected, actual)

        # Passing a bare index instead of a frame must raise TypeError.
        with self.assertRaises(TypeError):
            psdf.reindex_like(other_rows)
        # MultiIndex frame reindexed like the single-Index frame: AssertionError.
        with self.assertRaises(AssertionError):
            psdf2.reindex_like(psdf)

        # Reindexing MultiIndex on MultiIndex
        other_cols = pd.MultiIndex.from_tuples(
            [("numbers", "third"), ("values", "second")], names=["cols3", "cols4"]
        )
        pdf2.columns = other_cols
        psdf2.columns = other_cols

        col_labels = pd.MultiIndex.from_tuples(
            [("numbers", "first"), ("values", "second")], names=["cols1", "cols2"]
        )
        row_labels = pd.MultiIndex.from_tuples(
            [("A", "B"), ("C", "D"), ("E", "F")], names=["name1", "name2"]
        )
        pdf = pd.DataFrame(data=values, index=row_labels, columns=col_labels)
        psdf = ps.from_pandas(pdf)

        expected = pdf.reindex_like(pdf2).sort_index()
        actual = psdf.reindex_like(psdf2).sort_index()
        self.assert_eq(expected, actual)
class FrameReindexTests(FrameReindexMixin, PandasOnSparkTestCase, SQLTestUtils):
    """Concrete test case: the reindex mixin combined with the Spark test bases."""
if __name__ == "__main__":
    # Script entry point: load the test module's names, then run unittest.
    from pyspark.pandas.tests.indexes.test_reindex import *  # noqa: F401

    # Fall back to unittest's default runner unless xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep the None default.
        pass

    unittest.main(testRunner=testRunner, verbosity=2)