# blob: 69e97bfde6c5c92fe5dda74077b8295c38ffa93c [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
import numpy as np
import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
class SeriesIndexMixin:
    """Index-related Series tests comparing pandas-on-Spark against pandas.

    Each test builds a pandas object, converts it with ``ps.from_pandas`` (or
    ``ps.Series``), applies the same operation to both and compares the results.
    This is a mixin: the assertion helpers it calls (``assert_eq``,
    ``assertRaises``, ``assertRaisesRegex``, ...) must be supplied by the
    test-case classes it is combined with.
    """

    def test_align(self):
        # Series.align against a Series and against a DataFrame, for every join
        # type with axis None and 0; axis=1 is expected to raise ValueError.
        pd_df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
        ps_df = ps.from_pandas(pd_df)

        for how in ("outer", "inner", "left", "right"):
            for ax in (None, 0):
                # Series aligned with another Series.
                got_left, got_right = ps_df.a.align(ps_df.b, join=how, axis=ax)
                want_left, want_right = pd_df.a.align(pd_df.b, join=how, axis=ax)
                self.assert_eq(got_left, want_left)
                self.assert_eq(got_right, want_right)

                # Series aligned with a DataFrame whose columns are reordered.
                got_left, got_frame = ps_df.b.align(ps_df[["b", "a"]], join=how, axis=ax)
                want_left, want_frame = pd_df.b.align(pd_df[["b", "a"]], join=how, axis=ax)
                self.assert_eq(got_left, want_left)
                self.assert_eq(got_frame, want_frame)

        self.assertRaises(ValueError, lambda: ps_df.a.align(ps_df.b, axis=1))

    def test_reindex(self):
        # Series.reindex with existing labels, unknown labels and a fill value.
        labels = ["A", "B", "C", "D", "E"]
        pd_ser = pd.Series([1.0, 2.0, 3.0, 4.0, None], index=labels, name="x")
        ps_ser = ps.from_pandas(pd_ser)
        self.assert_eq(pd_ser, ps_ser)

        # (target labels, extra keyword arguments) for each reindex call.
        cases = [
            (["A", "B"], {}),
            (["A", "B", "2", "3"], {}),
            (["A", "E", "2"], {"fill_value": 0}),
        ]
        for target, kwargs in cases:
            self.assert_eq(
                pd_ser.reindex(target, **kwargs).sort_index(),
                ps_ser.reindex(target, **kwargs).sort_index(),
            )

        self.assertRaises(TypeError, lambda: ps_ser.reindex(index=123))

    def test_reindex_like(self):
        # Series.reindex_like across single-Index / MultiIndex combinations and
        # against a DataFrame, plus the argument errors it is expected to raise.
        values = [1.0, 2.0, None]
        flat_index = pd.Index(["A", "B", "C"], name="index1")
        pd_ser = pd.Series(data=values, index=flat_index, name="name1")
        ps_ser = ps.from_pandas(pd_ser)

        # Reindexing single Index on single Index
        other_values = [3.0, None, 4.0]
        other_index = pd.Index(["A", "C", "D"], name="index2")
        pd_other = pd.Series(data=other_values, index=other_index, name="name2")
        ps_other = ps.from_pandas(pd_other)

        self.assert_eq(
            pd_ser.reindex_like(pd_other).sort_index(),
            ps_ser.reindex_like(ps_other).sort_index(),
        )
        self.assert_eq(
            (pd_ser + 1).reindex_like(pd_other).sort_index(),
            (ps_ser + 1).reindex_like(ps_other).sort_index(),
        )

        # Reindexing MultiIndex on single Index
        other_index = pd.MultiIndex.from_tuples(
            [("A", "G"), ("C", "D"), ("I", "J")], names=["index3", "index4"]
        )
        pd_other = pd.Series(data=other_values, index=other_index, name="name2")
        ps_other = ps.from_pandas(pd_other)

        self.assert_eq(
            pd_ser.reindex_like(pd_other).sort_index(),
            ps_ser.reindex_like(ps_other).sort_index(),
        )

        # A bare pandas index is not an accepted target, and the reverse
        # direction (MultiIndex series reindexed like a flat one) must assert.
        self.assertRaises(TypeError, lambda: ps_ser.reindex_like(other_index))
        self.assertRaises(AssertionError, lambda: ps_other.reindex_like(ps_ser))

        # Reindexing MultiIndex on MultiIndex
        multi_index = pd.MultiIndex.from_tuples(
            [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
        )
        pd_ser = pd.Series(data=values, index=multi_index, name="name1")
        ps_ser = ps.from_pandas(pd_ser)

        self.assert_eq(
            pd_ser.reindex_like(pd_other).sort_index(),
            ps_ser.reindex_like(ps_other).sort_index(),
        )

        # Reindexing with DataFrame
        frame_index = pd.MultiIndex.from_tuples(
            [("A", "B"), ("C", "D"), ("E", "F")], names=["name3", "name4"]
        )
        pd_df = pd.DataFrame(data=values, index=frame_index)
        ps_df = ps.from_pandas(pd_df)

        self.assert_eq(
            pd_ser.reindex_like(pd_df).sort_index(),
            ps_ser.reindex_like(ps_df).sort_index(),
        )

    def test_reset_index(self):
        # Series.reset_index: default, with a value-column name, with drop=True,
        # and in place (the parent DataFrames are compared afterwards as well).
        pd_df = pd.DataFrame(
            {"foo": [1, 2, 3, 4]},
            index=pd.Index(["a", "b", "c", "d"], name="idx"),
        )
        ps_df = ps.from_pandas(pd_df)

        pd_ser = pd_df.foo
        ps_ser = ps_df.foo

        self.assert_eq(ps_ser.reset_index(), pd_ser.reset_index())
        self.assert_eq(ps_ser.reset_index(name="values"), pd_ser.reset_index(name="values"))
        self.assert_eq(ps_ser.reset_index(drop=True), pd_ser.reset_index(drop=True))

        # inplace
        ps_ser.reset_index(drop=True, inplace=True)
        pd_ser.reset_index(drop=True, inplace=True)
        self.assert_eq(ps_ser, pd_ser)
        self.assert_eq(ps_df, pd_df)

    def test_reset_index_with_default_index_types(self):
        # reset_index under each value of the "compute.default_index_type" option.
        pd_ser = pd.Series([1, 2, 3], name="0", index=np.random.rand(3))
        ps_ser = ps.from_pandas(pd_ser)
        expected = pd_ser.reset_index

        with ps.option_context("compute.default_index_type", "sequence"):
            self.assert_eq(ps_ser.reset_index(), expected())

        with ps.option_context("compute.default_index_type", "distributed-sequence"):
            # Row order may differ, so sort by the new index before comparing.
            self.assert_eq(ps_ser.reset_index().sort_index(), expected())

        with ps.option_context("compute.default_index_type", "distributed"):
            # Index values differ here, so compare after discarding the index.
            self.assert_eq(
                ps_ser.reset_index()._to_pandas().reset_index(drop=True), expected()
            )

    def test_index_to_series_reset_index(self):
        # reset_index on Series produced by Index.to_series, with and without a
        # name (including a tuple name).
        def check(psser, pser):
            # Non-mutating variants first, then the in-place variant on both.
            self.assert_eq(psser.reset_index(), pser.reset_index())
            self.assert_eq(psser.reset_index(drop=True), pser.reset_index(drop=True))

            pser.reset_index(drop=True, inplace=True)
            psser.reset_index(drop=True, inplace=True)
            self.assert_eq(psser, pser)

        pd_df = pd.DataFrame(
            {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
            index=np.random.rand(9),
        )
        ps_df = ps.from_pandas(pd_df)

        for to_series_kwargs in ({}, {"name": "a"}, {"name": ("x", "a")}):
            check(
                ps_df.index.to_series(**to_series_kwargs),
                pd_df.index.to_series(**to_series_kwargs),
            )

    def test_idxmax(self):
        # Series.idxmax on a flat index, a MultiIndex, an empty Series and data
        # containing a missing value and repeated maxima.
        pd_ser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"])
        ps_ser = ps.Series(pd_ser)
        self.assertEqual(ps_ser.idxmax(), pd_ser.idxmax())
        self.assertEqual(ps_ser.idxmax(skipna=False), pd_ser.idxmax(skipna=False))

        multi_index = pd.MultiIndex.from_arrays(
            [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second")
        )
        pd_ser = pd.Series(data=[1, 2, 4, 5], index=multi_index)
        ps_ser = ps.Series(pd_ser)
        self.assertEqual(ps_ser.idxmax(), pd_ser.idxmax())
        self.assertEqual(ps_ser.idxmax(skipna=False), pd_ser.idxmax(skipna=False))

        empty = ps.Series([])
        with self.assertRaisesRegex(ValueError, "an empty sequence"):
            empty.idxmax()

        pd_ser = pd.Series([1, 100, None, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])
        ps_ser = ps.Series(pd_ser)
        self.assertEqual(ps_ser.idxmax(), pd_ser.idxmax())
        # Compared through repr() rather than by value.
        self.assertEqual(repr(ps_ser.idxmax(skipna=False)), repr(pd_ser.idxmax(skipna=False)))

    def test_idxmin(self):
        # Series.idxmin on a flat index, a MultiIndex, an empty Series and data
        # containing a missing value and repeated minima.
        pd_ser = pd.Series(data=[1, 4, 5], index=["A", "B", "C"])
        ps_ser = ps.Series(pd_ser)
        self.assertEqual(ps_ser.idxmin(), pd_ser.idxmin())
        self.assertEqual(ps_ser.idxmin(skipna=False), pd_ser.idxmin(skipna=False))

        multi_index = pd.MultiIndex.from_arrays(
            [["a", "a", "b", "b"], ["c", "d", "e", "f"]], names=("first", "second")
        )
        pd_ser = pd.Series(data=[1, 2, 4, 5], index=multi_index)
        ps_ser = ps.Series(pd_ser)
        self.assertEqual(ps_ser.idxmin(), pd_ser.idxmin())
        self.assertEqual(ps_ser.idxmin(skipna=False), pd_ser.idxmin(skipna=False))

        empty = ps.Series([])
        with self.assertRaisesRegex(ValueError, "an empty sequence"):
            empty.idxmin()

        pd_ser = pd.Series([1, 100, None, 100, 1, 100], index=[10, 3, 5, 2, 1, 8])
        ps_ser = ps.Series(pd_ser)
        self.assertEqual(ps_ser.idxmin(), pd_ser.idxmin())
        # Compared through repr() rather than by value.
        self.assertEqual(repr(ps_ser.idxmin(skipna=False)), repr(pd_ser.idxmin(skipna=False)))

    def test_index(self):
        # Index name(s) must agree after attributes are set on the Series.
        labels = pd.Index([1, 2, 3, 4, 5, 6, 7, 8, 9])
        pd_ser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=labels)
        ps_ser = ps.from_pandas(pd_ser)

        # Single name.
        ps_ser.name = "koalas"
        pd_ser.name = "koalas"
        self.assert_eq(ps_ser.index.name, pd_ser.index.name)

        # Multiple names.
        # NOTE(review): this assigns a `names` attribute on the Series objects, not
        # on their index, and the index built above is a plain pd.Index -- confirm
        # this exercises MultiIndex names as intended.
        ps_ser.names = ["hello", "koalas"]
        pd_ser.names = ["hello", "koalas"]
        self.assert_eq(ps_ser.index.names, pd_ser.index.names)

    def test_axes(self):
        # Series.axes for a flat index and for a MultiIndex.
        pd_ser = pd.Series([90, 91, 85], index=[2, 4, 1])
        ps_ser = ps.from_pandas(pd_ser)
        self.assert_eq(ps_ser.axes, pd_ser.axes)

        # for MultiIndex
        multi_index = pd.MultiIndex(
            [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
        )
        pd_ser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=multi_index)
        ps_ser = ps.from_pandas(pd_ser)
        self.assert_eq(ps_ser.axes, pd_ser.axes)

    def test_swaplevel(self):
        # Series.swaplevel by position and by name on two- and three-level
        # MultiIndexes, followed by the expected error cases.

        # MultiIndex with two levels
        level_values = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
        multi_index = pd.MultiIndex.from_arrays(level_values, names=("number", "color"))
        pd_ser = pd.Series(["a", "b", "c", "d"], index=multi_index)
        ps_ser = ps.from_pandas(pd_ser)

        for levels in [(), (0, 1), (1, 1), ("number", "color")]:
            self.assert_eq(pd_ser.swaplevel(*levels), ps_ser.swaplevel(*levels))

        # MultiIndex with more than two levels
        level_values = [[1, 1, 2, 2], ["red", "blue", "red", "blue"], ["l", "m", "s", "xs"]]
        multi_index = pd.MultiIndex.from_arrays(level_values, names=("number", "color", "size"))
        pd_ser = pd.Series(["a", "b", "c", "d"], index=multi_index)
        ps_ser = ps.from_pandas(pd_ser)

        three_level_cases = [
            (),
            (0, 1),
            (0, 2),
            (1, 2),
            (1, 1),
            (-1, -2),
            ("number", "color"),
            ("number", "size"),
            ("color", "size"),
        ]
        for levels in three_level_cases:
            self.assert_eq(pd_ser.swaplevel(*levels), ps_ser.swaplevel(*levels))

        # Error conditions
        self.assertRaises(AssertionError, lambda: ps.Series([1, 2]).swaplevel())
        self.assertRaises(IndexError, lambda: ps_ser.swaplevel(0, 9))
        self.assertRaises(KeyError, lambda: ps_ser.swaplevel("not_number", "color"))
        self.assertRaises(AssertionError, lambda: ps_ser.swaplevel(copy=False))

    def test_swapaxes(self):
        # Series.swapaxes only accepts the index axis; other axes or copy=False
        # are expected to raise.
        pd_ser = pd.Series([1, 2, 3], index=["x", "y", "z"], name="ser")
        ps_ser = ps.from_pandas(pd_ser)

        self.assert_eq(ps_ser.swapaxes(0, 0), pd_ser.swapaxes(0, 0))
        self.assert_eq(ps_ser.swapaxes("index", "index"), pd_ser.swapaxes("index", "index"))
        self.assert_eq((ps_ser + 1).swapaxes(0, 0), (pd_ser + 1).swapaxes(0, 0))

        self.assertRaises(AssertionError, lambda: ps_ser.swapaxes(0, 1, copy=False))
        self.assertRaises(ValueError, lambda: ps_ser.swapaxes(0, 1))
        self.assertRaises(ValueError, lambda: ps_ser.swapaxes("index", "columns"))

    def test_droplevel(self):
        # Series.droplevel with scalar, list and tuple level specifications,
        # the expected error messages, and tuple-valued level names.
        pd_ser = pd.Series(
            [1, 2, 3],
            index=pd.MultiIndex.from_tuples(
                [("x", "a", "q"), ("x", "b", "w"), ("y", "c", "e")],
                names=["level_1", "level_2", "level_3"],
            ),
        )
        ps_ser = ps.from_pandas(pd_ser)

        valid_levels = [
            0,
            "level_1",
            -1,
            [0],
            ["level_1"],
            (0,),
            ("level_1",),
            [0, 2],
            ["level_1", "level_3"],
            (1, 2),
            ("level_2", "level_3"),
        ]
        for level in valid_levels:
            self.assert_eq(pd_ser.droplevel(level), ps_ser.droplevel(level))

        with self.assertRaisesRegex(KeyError, "Level {0, 1, 2} not found"):
            ps_ser.droplevel({0, 1, 2})

        with self.assertRaisesRegex(KeyError, "Level level_100 not found"):
            ps_ser.droplevel(["level_1", "level_100"])

        with self.assertRaisesRegex(IndexError, "Too many levels: Index has only 3 levels, not 11"):
            ps_ser.droplevel(10)

        with self.assertRaisesRegex(
            IndexError,
            "Too many levels: Index has only 3 levels, -10 is not a valid level number",
        ):
            ps_ser.droplevel(-10)

        with self.assertRaisesRegex(
            ValueError,
            "Cannot remove 3 levels from an index with 3 levels: "
            "at least one level must be left.",
        ):
            ps_ser.droplevel([0, 1, 2])

        with self.assertRaisesRegex(
            ValueError,
            "Cannot remove 5 levels from an index with 3 levels: "
            "at least one level must be left.",
        ):
            ps_ser.droplevel([1, 1, 1, 1, 1])

        # Tupled names
        pd_ser.index.names = [("a", "1"), ("b", "2"), ("c", "3")]
        ps_ser = ps.from_pandas(pd_ser)

        tuple_named_levels = [("a", "1"), ("c", "3")]
        self.assert_eq(
            pd_ser.droplevel(tuple_named_levels), ps_ser.droplevel(tuple_named_levels)
        )

    def test_last_valid_index(self):
        # Series.last_valid_index with trailing missing values, with a
        # MultiIndex, and on an empty Series.
        pd_ser = pd.Series([250, 1.5, 320, 1, 0.3, None, None, None, None])
        ps_ser = ps.from_pandas(pd_ser)
        self.assert_eq(pd_ser.last_valid_index(), ps_ser.last_valid_index())

        # Same data, now indexed by a MultiIndex.
        multi_index = pd.MultiIndex(
            [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
            [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
        )
        pd_ser.index = multi_index
        ps_ser = ps.from_pandas(pd_ser)
        self.assert_eq(pd_ser.last_valid_index(), ps_ser.last_valid_index())

        # Empty Series
        pd_ser = pd.Series([])
        ps_ser = ps.from_pandas(pd_ser)
        self.assert_eq(pd_ser.last_valid_index(), ps_ser.last_valid_index())

    def test_first_valid_index(self):
        # Series.first_valid_index on an empty Series.
        pd_ser = pd.Series([])
        ps_ser = ps.from_pandas(pd_ser)
        self.assert_eq(pd_ser.first_valid_index(), ps_ser.first_valid_index())

    def test_rename_axis(self):
        # Series.rename_axis with a scalar, a list, a mapping and a callable,
        # on a single Index and on a MultiIndex, including the in-place form.
        flat_index = pd.Index(["A", "B", "C"], name="index")
        pd_df = pd.DataFrame({"x": [1.0, 2.0, 3.0]}, index=flat_index)
        ps_df = ps.from_pandas(pd_df)
        pd_ser, ps_ser = pd_df.x, ps_df.x

        self.assert_eq(
            pd_ser.rename_axis("index2").sort_index(),
            ps_ser.rename_axis("index2").sort_index(),
        )
        self.assert_eq(
            (pd_ser + 1).rename_axis("index2").sort_index(),
            (ps_ser + 1).rename_axis("index2").sort_index(),
        )

        # Too many names for a single-level index, and mapper combined with index.
        self.assertRaises(ValueError, lambda: ps_ser.rename_axis(["index2", "index3"]))
        self.assertRaises(
            TypeError,
            lambda: ps_ser.rename_axis(mapper=["index2"], index=["index3"]),
        )

        # The mapping contains a key ("missing") that matches no level name.
        renames = {"index": "index2", "missing": "index4"}
        self.assert_eq(
            pd_ser.rename_axis(index=renames).sort_index(),
            ps_ser.rename_axis(index=renames).sort_index(),
        )
        self.assert_eq(
            pd_ser.rename_axis(index=str.upper).sort_index(),
            ps_ser.rename_axis(index=str.upper).sort_index(),
        )

        pd_ser.rename_axis("index2", inplace=True)
        ps_ser.rename_axis("index2", inplace=True)
        self.assert_eq(pd_ser.sort_index(), ps_ser.sort_index())
        # Note: in pandas the in-place rename changes pdf.x's index name but, due
        # to a bug, not the index name of the parent pdf.
        self.assert_eq(pd_df, ps_df)

        multi_index = pd.MultiIndex.from_tuples(
            [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
        )
        pd_ser = pd.Series([1.0, 2.0, 3.0], index=multi_index, name="name")
        ps_ser = ps.from_pandas(pd_ser)

        self.assert_eq(
            pd_ser.rename_axis(["index3", "index4"]).sort_index(),
            ps_ser.rename_axis(["index3", "index4"]).sort_index(),
        )

        self.assertRaises(ValueError, lambda: ps_ser.rename_axis(["index3", "index4", "index5"]))

        self.assert_eq(
            pd_ser.rename_axis(
                index={"index1": "index3", "index2": "index4", "missing": "index5"}
            ).sort_index(),
            ps_ser.rename_axis(
                index={"index1": "index3", "index2": "index4", "missing": "index5"}
            ).sort_index(),
        )
        self.assert_eq(
            pd_ser.rename_axis(index=str.upper).sort_index(),
            ps_ser.rename_axis(index=str.upper).sort_index(),
        )
class SeriesIndexTests(SeriesIndexMixin, PandasOnSparkTestCase, SQLTestUtils):
    """Concrete test case: the SeriesIndexMixin tests combined with
    PandasOnSparkTestCase and SQLTestUtils."""
if __name__ == "__main__":
    from pyspark.pandas.tests.series.test_index import *  # noqa: F401

    # xmlrunner is an optional dependency: when it cannot be imported the runner
    # stays None and unittest.main is called without an explicit runner.
    testRunner = None
    try:
        import xmlrunner

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        pass

    unittest.main(testRunner=testRunner, verbosity=2)