blob: ce5fab8e4b5f3d959815d3715c85a9416366ad2a [file]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
class FrameAnyAllMixin:
    """Parity checks for ``DataFrame.all`` and ``DataFrame.any``.

    Every assertion evaluates the same reduction on a pandas-on-Spark frame and
    on the pandas frame it was built from, then compares the two results with
    ``self.assert_eq``. The class is a mixin: it must be combined with a test
    case class that provides ``assert_eq``.
    """

    def test_all(self):
        """Compare ``all`` across ``bool_only``, ``axis``, ``skipna`` and column-index shapes."""
        # Columns cover all-False bools, mixed bools, integers, and bools mixed with None.
        pdf = pd.DataFrame(
            {
                "col1": [False, False, False],
                "col2": [True, False, False],
                "col3": [0, 0, 1],
                "col4": [0, 1, 2],
                "col5": [False, False, None],
                "col6": [True, False, None],
            },
            index=np.random.rand(3),
        )
        pdf.name = "x"
        psdf = ps.from_pandas(pdf)

        self.assert_eq(psdf.all(), pdf.all())
        self.assert_eq(psdf.all(bool_only=True), pdf.all(bool_only=True))
        self.assert_eq(psdf.all(bool_only=False), pdf.all(bool_only=False))

        # Single-column frame whose only column mixes bools with None.
        self.assert_eq(psdf[["col5"]].all(bool_only=True), pdf[["col5"]].all(bool_only=True))
        self.assert_eq(psdf[["col5"]].all(bool_only=False), pdf[["col5"]].all(bool_only=False))

        # Repeat with two-level (MultiIndex) column labels.
        columns = pd.MultiIndex.from_tuples(
            [
                ("a", "col1"),
                ("a", "col2"),
                ("a", "col3"),
                ("b", "col4"),
                ("b", "col5"),
                ("c", "col6"),
            ]
        )
        pdf.columns = columns
        psdf.columns = columns

        self.assert_eq(psdf.all(), pdf.all())
        self.assert_eq(psdf.all(bool_only=True), pdf.all(bool_only=True))
        self.assert_eq(psdf.all(bool_only=False), pdf.all(bool_only=False))

        # Test axis=1
        self.assert_eq(psdf.all(axis=1), pdf.all(axis=1))
        self.assert_eq(psdf.all(axis=1, bool_only=True), pdf.all(axis=1, bool_only=True))
        self.assert_eq(psdf.all(axis=1, bool_only=False), pdf.all(axis=1, bool_only=False))

        # Test axis='index'
        self.assert_eq(psdf.all(axis="index"), pdf.all(axis="index"))
        self.assert_eq(
            psdf.all(axis="index", bool_only=True), pdf.all(axis="index", bool_only=True)
        )
        self.assert_eq(
            psdf.all(axis="index", bool_only=False), pdf.all(axis="index", bool_only=False)
        )

        # Test axis='columns'
        self.assert_eq(psdf.all(axis="columns"), pdf.all(axis="columns"))
        self.assert_eq(
            psdf.all(axis="columns", bool_only=True), pdf.all(axis="columns", bool_only=True)
        )
        self.assert_eq(
            psdf.all(axis="columns", bool_only=False), pdf.all(axis="columns", bool_only=False)
        )

        # Test axis=None
        self.assert_eq(psdf.all(axis=None), pdf.all(axis=None))
        self.assert_eq(psdf.all(axis=None, bool_only=True), pdf.all(axis=None, bool_only=True))
        self.assert_eq(psdf.all(axis=None, bool_only=False), pdf.all(axis=None, bool_only=False))

        # Repeat once more with the column levels named.
        columns.names = ["X", "Y"]
        pdf.columns = columns
        psdf.columns = columns

        self.assert_eq(psdf.all(), pdf.all())
        self.assert_eq(psdf.all(bool_only=True), pdf.all(bool_only=True))
        self.assert_eq(psdf.all(bool_only=False), pdf.all(bool_only=False))

        # Test axis=1
        self.assert_eq(psdf.all(axis=1), pdf.all(axis=1))
        self.assert_eq(psdf.all(axis=1, bool_only=True), pdf.all(axis=1, bool_only=True))
        self.assert_eq(psdf.all(axis=1, bool_only=False), pdf.all(axis=1, bool_only=False))

        # Test axis=None
        self.assert_eq(psdf.all(axis=None), pdf.all(axis=None))
        self.assert_eq(psdf.all(axis=None, bool_only=True), pdf.all(axis=None, bool_only=True))
        self.assert_eq(psdf.all(axis=None, bool_only=False), pdf.all(axis=None, bool_only=False))

        # Test skipna
        pdf = pd.DataFrame({"A": [True, True], "B": [1, np.nan], "C": [True, None]})
        pdf.name = "x"
        psdf = ps.from_pandas(pdf)

        # bools and np.nan
        self.assert_eq(psdf[["A", "B"]].all(skipna=False), pdf[["A", "B"]].all(skipna=False))
        self.assert_eq(
            psdf[["A", "B"]].all(axis=1, skipna=False), pdf[["A", "B"]].all(axis=1, skipna=False)
        )
        # bools and None
        self.assert_eq(psdf[["A", "C"]].all(skipna=False), pdf[["A", "C"]].all(skipna=False))
        # np.nan and None
        self.assert_eq(psdf[["B", "C"]].all(skipna=False), pdf[["B", "C"]].all(skipna=False))
        # Whole frame, with skipna off, on, and left at its default.
        self.assert_eq(psdf.all(skipna=False), pdf.all(skipna=False))
        self.assert_eq(psdf.all(skipna=True), pdf.all(skipna=True))
        self.assert_eq(psdf.all(), pdf.all())

        # np.nan only
        self.assert_eq(
            ps.DataFrame([np.nan]).all(skipna=False),
            pd.DataFrame([np.nan]).all(skipna=False),
            almost=True,
        )
        # None only
        self.assert_eq(
            ps.DataFrame([None]).all(skipna=True),
            pd.DataFrame([None]).all(skipna=True),
            almost=True,
        )

    def test_any(self):
        """Compare ``any`` across ``bool_only``, ``axis``, ``skipna`` and column-index shapes."""
        # Columns cover all-False bools, mixed bools, integers, and bools mixed with None.
        pdf = pd.DataFrame(
            {
                "col1": [False, False, False],
                "col2": [True, False, False],
                "col3": [0, 0, 1],
                "col4": [0, 1, 2],
                "col5": [False, False, None],
                "col6": [True, False, None],
            },
            index=np.random.rand(3),
        )
        pdf.name = "x"
        psdf = ps.from_pandas(pdf)

        self.assert_eq(psdf.any(), pdf.any())
        self.assert_eq(psdf.any(bool_only=True), pdf.any(bool_only=True))
        self.assert_eq(psdf.any(bool_only=False), pdf.any(bool_only=False))

        # Single-column frame whose only column mixes bools with None. These
        # assertions exercise ``any`` (the reduction under test here); the
        # matching ``all`` checks live in ``test_all``.
        self.assert_eq(psdf[["col5"]].any(bool_only=True), pdf[["col5"]].any(bool_only=True))
        self.assert_eq(psdf[["col5"]].any(bool_only=False), pdf[["col5"]].any(bool_only=False))

        # Repeat with two-level (MultiIndex) column labels.
        columns = pd.MultiIndex.from_tuples(
            [
                ("a", "col1"),
                ("a", "col2"),
                ("a", "col3"),
                ("b", "col4"),
                ("b", "col5"),
                ("c", "col6"),
            ]
        )
        pdf.columns = columns
        psdf.columns = columns

        self.assert_eq(psdf.any(), pdf.any())
        self.assert_eq(psdf.any(bool_only=True), pdf.any(bool_only=True))
        self.assert_eq(psdf.any(bool_only=False), pdf.any(bool_only=False))

        # Test axis=1
        self.assert_eq(psdf.any(axis=1), pdf.any(axis=1))
        self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1, bool_only=True))
        self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1, bool_only=False))

        # Test axis='index'
        self.assert_eq(psdf.any(axis="index"), pdf.any(axis="index"))
        self.assert_eq(
            psdf.any(axis="index", bool_only=True), pdf.any(axis="index", bool_only=True)
        )
        self.assert_eq(
            psdf.any(axis="index", bool_only=False), pdf.any(axis="index", bool_only=False)
        )

        # Test axis='columns'
        self.assert_eq(psdf.any(axis="columns"), pdf.any(axis="columns"))
        self.assert_eq(
            psdf.any(axis="columns", bool_only=True), pdf.any(axis="columns", bool_only=True)
        )
        self.assert_eq(
            psdf.any(axis="columns", bool_only=False), pdf.any(axis="columns", bool_only=False)
        )

        # Test axis=None
        self.assert_eq(psdf.any(axis=None), pdf.any(axis=None))
        self.assert_eq(psdf.any(axis=None, bool_only=True), pdf.any(axis=None, bool_only=True))
        self.assert_eq(psdf.any(axis=None, bool_only=False), pdf.any(axis=None, bool_only=False))

        # Repeat once more with the column levels named.
        columns.names = ["X", "Y"]
        pdf.columns = columns
        psdf.columns = columns

        self.assert_eq(psdf.any(), pdf.any())
        self.assert_eq(psdf.any(bool_only=True), pdf.any(bool_only=True))
        self.assert_eq(psdf.any(bool_only=False), pdf.any(bool_only=False))

        # Test axis=1
        self.assert_eq(psdf.any(axis=1), pdf.any(axis=1))
        self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1, bool_only=True))
        self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1, bool_only=False))

        # Test axis=None
        self.assert_eq(psdf.any(axis=None), pdf.any(axis=None))
        self.assert_eq(psdf.any(axis=None, bool_only=True), pdf.any(axis=None, bool_only=True))
        self.assert_eq(psdf.any(axis=None, bool_only=False), pdf.any(axis=None, bool_only=False))

        # Test skipna parameter
        pdf = pd.DataFrame(
            {"A": [True, False], "B": [1, np.nan], "C": [True, None], "D": [None, np.nan]}
        )
        psdf = ps.from_pandas(pdf)

        # bools and np.nan
        self.assert_eq(psdf[["A", "B"]].any(skipna=False), pdf[["A", "B"]].any(skipna=False))
        self.assert_eq(
            psdf[["A", "B"]].any(axis=1, skipna=False), pdf[["A", "B"]].any(axis=1, skipna=False)
        )
        # bools and None
        self.assert_eq(psdf[["A", "C"]].any(skipna=False), pdf[["A", "C"]].any(skipna=False))
        # bools, np.nan, and None
        self.assert_eq(psdf[["B", "C"]].any(skipna=False), pdf[["B", "C"]].any(skipna=False))
        # np.nan, and None
        self.assert_eq(psdf[["D"]].any(skipna=False), pdf[["D"]].any(skipna=False))
        self.assert_eq(psdf[["D"]].any(axis=1, skipna=False), pdf[["D"]].any(axis=1, skipna=False))

        # np.nan only
        self.assert_eq(
            ps.DataFrame([np.nan]).any(skipna=False),
            pd.DataFrame([np.nan]).any(skipna=False),
            almost=True,
        )
        self.assert_eq(
            ps.DataFrame([np.nan]).any(axis=1, skipna=False),
            pd.DataFrame([np.nan]).any(axis=1, skipna=False),
            almost=True,
        )

        # None only
        self.assert_eq(
            ps.DataFrame([None]).any(skipna=True),
            pd.DataFrame([None]).any(skipna=True),
            almost=True,
        )
        self.assert_eq(
            ps.DataFrame([None]).any(axis=1, skipna=True),
            pd.DataFrame([None]).any(axis=1, skipna=True),
            almost=True,
        )
class FrameAnyAllTests(FrameAnyAllMixin, PandasOnSparkTestCase):
    """Run the ``FrameAnyAllMixin`` checks as a ``PandasOnSparkTestCase``."""
if __name__ == "__main__":
    # Running this file directly invokes ``pyspark.testing.main``.
    from pyspark.testing import main as run_tests

    run_tests()