blob: 7336ef23cc6117c59a0d7d9472a4bec325caf49f [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
import numpy as np
import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
class SeriesMissingDataMixin:
def test_fillna(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf)
pser = pdf.x
psser = psdf.x
self.assert_eq(psser.fillna(0), pser.fillna(0))
self.assert_eq(psser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0))
psser.fillna(0, inplace=True)
pser.fillna(0, inplace=True)
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)
# test considering series does not have NA/NaN values
psser.fillna(0, inplace=True)
pser.fillna(0, inplace=True)
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)
psser = psdf.x.rename("y")
pser = pdf.x.rename("y")
psser.fillna(0, inplace=True)
pser.fillna(0, inplace=True)
self.assert_eq(psser.head(), pser.head())
self.assert_eq(psdf, pdf)
pser = pd.Series([1, 2, 3, 4, 5, 6], name="x")
psser = ps.from_pandas(pser)
pser.loc[3] = np.nan
psser.loc[3] = np.nan
self.assert_eq(psser.fillna(0), pser.fillna(0))
self.assert_eq(psser.fillna(method="ffill"), pser.fillna(method="ffill"))
self.assert_eq(psser.fillna(method="bfill"), pser.fillna(method="bfill"))
self.assert_eq(psser.fillna(method="backfill"), pser.fillna(method="backfill"))
# inplace fillna on non-nullable column
pdf = pd.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]})
psdf = ps.from_pandas(pdf)
pser = pdf.b
psser = psdf.b
self.assert_eq(psser.fillna(0), pser.fillna(0))
self.assert_eq(psser.fillna(np.nan).fillna(0), pser.fillna(np.nan).fillna(0))
psser.fillna(0, inplace=True)
pser.fillna(0, inplace=True)
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)
with self.assertRaisesRegex(
ValueError, "Must specify a fillna 'value' or 'method' parameter."
):
psser.fillna()
with self.assertRaisesRegex(TypeError, "Unsupported type list"):
psdf.a.fillna([0])
with self.assertRaisesRegex(
NotImplementedError, "fillna currently only works for axis=0 or axis='index'"
):
psdf.a.fillna(0, axis=1)
with self.assertRaisesRegex(
NotImplementedError, "limit parameter for value is not support now"
):
psdf.a.fillna(0, limit=1)
def test_dropna(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf)
pser = pdf.x
psser = psdf.x
self.assert_eq(psser.dropna(), pser.dropna())
pser.dropna(inplace=True)
psser.dropna(inplace=True)
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)
def test_add_and_radd_fill_value(self):
pser = pd.Series([1, 2, None, 4])
psser = ps.from_pandas(pser)
values = [-10, -0.5, 0, None, 0.5, 10]
for value in values:
self.assert_eq(pser.add(10, fill_value=value), psser.add(10, fill_value=value))
self.assert_eq(pser.radd(10, fill_value=value), psser.radd(10, fill_value=value))
self.assertRaises(NotImplementedError, lambda: psser.add([1, 2, None, None], fill_value=10))
self.assertRaises(
NotImplementedError, lambda: psser.radd([1, 2, None, None], fill_value=10)
)
def test_replace(self):
pser = pd.Series([10, 20, 15, 30, np.nan], name="x")
psser = ps.from_pandas(pser)
self.assert_eq(psser.replace(), pser.replace())
self.assert_eq(psser.replace({}), pser.replace({}))
self.assert_eq(psser.replace(np.nan, 45), pser.replace(np.nan, 45))
self.assert_eq(psser.replace([10, 15], 45), pser.replace([10, 15], 45))
self.assert_eq(psser.replace((10, 15), 45), pser.replace((10, 15), 45))
self.assert_eq(psser.replace([10, 15], [45, 50]), pser.replace([10, 15], [45, 50]))
self.assert_eq(psser.replace((10, 15), (45, 50)), pser.replace((10, 15), (45, 50)))
pser = pd.Series(["bat", "foo", "bait", "abc", "bar", "zoo"])
psser = ps.from_pandas(pser)
self.assert_eq(
psser.replace(to_replace=r"^ba.$", value="new", regex=True),
pser.replace(to_replace=r"^ba.$", value="new", regex=True),
)
self.assert_eq(
psser.replace(regex=r"^.oo$", value="new"), pser.replace(regex=r"^.oo$", value="new")
)
self.assert_eq(
(psser + "o").replace(regex=r"^.ooo$", value="new"),
(pser + "o").replace(regex=r"^.ooo$", value="new"),
)
msg = "'to_replace' should be one of str, list, tuple, dict, int, float"
with self.assertRaisesRegex(TypeError, msg):
psser.replace(ps.range(5))
msg = "Replacement lists must match in length. Expecting 3 got 2"
with self.assertRaisesRegex(ValueError, msg):
psser.replace(["bat", "foo", "bait"], ["a", "b"])
msg = "'to_replace' must be 'None' if 'regex' is not a bool"
with self.assertRaisesRegex(ValueError, msg):
psser.replace(to_replace="foo", regex=r"^.oo$")
msg = "If 'regex' is True then 'to_replace' must be a string"
with self.assertRaisesRegex(AssertionError, msg):
psser.replace(["bat", "foo", "bait"], regex=True)
unsupported_regex = [r"^.oo$", r"^ba.$"]
msg = "'regex' of %s type is not supported" % type(unsupported_regex).__name__
with self.assertRaisesRegex(NotImplementedError, msg):
psser.replace(regex=unsupported_regex, value="new")
def test_bfill(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf)
pser = pdf.x
psser = psdf.x
self.assert_eq(psser.bfill(), pser.bfill())
self.assert_eq(psser.bfill()[0], pser.bfill()[0])
psser.bfill(inplace=True)
pser.bfill(inplace=True)
self.assert_eq(psser, pser)
self.assert_eq(psser[0], pser[0])
self.assert_eq(psdf, pdf)
def test_ffill(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf)
pser = pdf.x
psser = psdf.x
self.assert_eq(psser.ffill(), pser.ffill())
self.assert_eq(psser.ffill()[4], pser.ffill()[4])
psser.ffill(inplace=True)
pser.ffill(inplace=True)
self.assert_eq(psser, pser)
self.assert_eq(psser[4], pser[4])
self.assert_eq(psdf, pdf)
def test_pad(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf)
pser, psser = pdf.x, psdf.x
self.assert_eq(pser.pad(), psser.pad())
# Test `inplace=True`
pser.pad(inplace=True)
psser.pad(inplace=True)
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
def test_backfill(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6]})
psdf = ps.from_pandas(pdf)
pser, psser = pdf.x, psdf.x
self.assert_eq(pser.backfill(), psser.backfill())
# Test `inplace=True`
pser.backfill(inplace=True)
psser.backfill(inplace=True)
self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)
class SeriesMissingDataTests(
SeriesMissingDataMixin,
PandasOnSparkTestCase,
SQLTestUtils,
):
pass
if __name__ == "__main__":
from pyspark.pandas.tests.series.test_missing_data import * # noqa: F401
try:
import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
testRunner = None
unittest.main(testRunner=testRunner, verbosity=2)