python/pyspark/pandas/tests/computation/test_apply_func.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 from datetime import datetime
 import sys
 import unittest
 from typing import List

 import numpy as np
 import pandas as pd

 from pyspark import pandas as ps
 from pyspark.loose_version import LooseVersion
 from pyspark.pandas.config import option_context
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils


 # This file contains test cases for 'Function application, GroupBy & Window'
 # https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/frame.html#function-application-groupby-window
 # as well as 'apply_batch*' and 'transform_batch*'.
 class FrameApplyFunctionMixin:
     @property
     def pdf(self):
         return pd.DataFrame(
             {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
             index=np.random.rand(9),
         )

     @property
     def psdf(self):
         return ps.from_pandas(self.pdf)

     def test_apply(self):
         pdf = pd.DataFrame(
             {
                 "a": [1, 2, 3, 4, 5, 6] * 100,
                 "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
                 "c": [1, 4, 9, 16, 25, 36] * 100,
             },
             columns=["a", "b", "c"],
             index=np.random.rand(600),
         )
         psdf = ps.DataFrame(pdf)

         self.assert_eq(
             psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
         )
         self.assert_eq(
             psdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
             pdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
         )
         self.assert_eq(
             psdf.apply(lambda x, b: x + b, b=1).sort_index(),
             pdf.apply(lambda x, b: x + b, b=1).sort_index(),
         )

         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
             )
             self.assert_eq(
                 psdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
                 pdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
             )
             self.assert_eq(
                 psdf.apply(lambda x, b: x + b, b=1).sort_index(),
                 pdf.apply(lambda x, b: x + b, b=1).sort_index(),
             )

         # returning a Series
         self.assert_eq(
             psdf.apply(lambda x: len(x), axis=1).sort_index(),
             pdf.apply(lambda x: len(x), axis=1).sort_index(),
         )
         self.assert_eq(
             psdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
             pdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.apply(lambda x: len(x), axis=1).sort_index(),
                 pdf.apply(lambda x: len(x), axis=1).sort_index(),
             )
             self.assert_eq(
                 psdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
                 pdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
             )

         with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
             psdf.apply(1)

         with self.assertRaisesRegex(TypeError, "The given function.*1 or 'column'; however"):

             def f1(_) -> ps.DataFrame[int]:
                 pass

             psdf.apply(f1, axis=0)

         with self.assertRaisesRegex(TypeError, "The given function.*0 or 'index'; however"):

             def f2(_) -> ps.Series[int]:
                 pass

             psdf.apply(f2, axis=1)

         # multi-index columns
         columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
         pdf.columns = columns
         psdf.columns = columns

         self.assert_eq(
             psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
             )

         # returning a Series
         self.assert_eq(
             psdf.apply(lambda x: len(x), axis=1).sort_index(),
             pdf.apply(lambda x: len(x), axis=1).sort_index(),
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.apply(lambda x: len(x), axis=1).sort_index(),
                 pdf.apply(lambda x: len(x), axis=1).sort_index(),
             )

     def test_apply_with_type(self):
         pdf = self.pdf
         psdf = ps.from_pandas(pdf)

         def identify1(x) -> ps.DataFrame[int, int]:
             return x

         # Type hints set the default column names, and we use default index for
         # pandas API on Spark. Here we ignore both diff.
         actual = psdf.apply(identify1, axis=1)
         expected = pdf.apply(identify1, axis=1)
         self.assert_eq(sorted(actual["c0"].to_numpy()), sorted(expected["a"].to_numpy()))
         self.assert_eq(sorted(actual["c1"].to_numpy()), sorted(expected["b"].to_numpy()))

         def identify2(x) -> ps.DataFrame[slice("a", int), slice("b", int)]:  # noqa: F405
             return x

         actual = psdf.apply(identify2, axis=1)
         expected = pdf.apply(identify2, axis=1)
         self.assert_eq(sorted(actual["a"].to_numpy()), sorted(expected["a"].to_numpy()))
         self.assert_eq(sorted(actual["b"].to_numpy()), sorted(expected["b"].to_numpy()))

     def test_apply_batch(self):
         pdf = pd.DataFrame(
             {
                 "a": [1, 2, 3, 4, 5, 6] * 100,
                 "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
                 "c": [1, 4, 9, 16, 25, 36] * 100,
             },
             columns=["a", "b", "c"],
             index=np.random.rand(600),
         )
         psdf = ps.DataFrame(pdf)

         self.assert_eq(
             psdf.pandas_on_spark.apply_batch(lambda pdf, a: pdf + a, args=(1,)).sort_index(),
             (pdf + 1).sort_index(),
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.pandas_on_spark.apply_batch(lambda pdf: pdf + 1).sort_index(),
                 (pdf + 1).sort_index(),
             )
             self.assert_eq(
                 psdf.pandas_on_spark.apply_batch(lambda pdf, b: pdf + b, b=1).sort_index(),
                 (pdf + 1).sort_index(),
             )

         with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
             psdf.pandas_on_spark.apply_batch(1)

         with self.assertRaisesRegex(TypeError, "The given function.*frame as its type hints"):

             def f2(_) -> ps.Series[int]:
                 pass

             psdf.pandas_on_spark.apply_batch(f2)

         with self.assertRaisesRegex(ValueError, "The given function should return a frame"):
             psdf.pandas_on_spark.apply_batch(lambda pdf: 1)

         # multi-index columns
         columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
         pdf.columns = columns
         psdf.columns = columns

         self.assert_eq(
             psdf.pandas_on_spark.apply_batch(lambda x: x + 1).sort_index(), (pdf + 1).sort_index()
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.pandas_on_spark.apply_batch(lambda x: x + 1).sort_index(),
                 (pdf + 1).sort_index(),
             )

     def test_apply_batch_with_type(self):
         pdf = self.pdf
         psdf = ps.from_pandas(pdf)

         def identify1(x) -> ps.DataFrame[int, int]:
             return x

         # Type hints set the default column names, and we use default index for
         # pandas API on Spark. Here we ignore both diff.
         actual = psdf.pandas_on_spark.apply_batch(identify1)
         expected = pdf
         self.assert_eq(sorted(actual["c0"].to_numpy()), sorted(expected["a"].to_numpy()))
         self.assert_eq(sorted(actual["c1"].to_numpy()), sorted(expected["b"].to_numpy()))

         def identify2(x) -> ps.DataFrame[slice("a", int), slice("b", int)]:  # noqa: F405
             return x

         actual = psdf.pandas_on_spark.apply_batch(identify2)
         expected = pdf
         self.assert_eq(sorted(actual["a"].to_numpy()), sorted(expected["a"].to_numpy()))
         self.assert_eq(sorted(actual["b"].to_numpy()), sorted(expected["b"].to_numpy()))

         pdf = pd.DataFrame(
             {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [[e] for e in [4, 5, 6, 3, 2, 1, 0, 0, 0]]},
             index=np.random.rand(9),
         )
         psdf = ps.from_pandas(pdf)

         def identify3(x) -> ps.DataFrame[float, [int, List[int]]]:
             return x

         actual = psdf.pandas_on_spark.apply_batch(identify3)
         actual.columns = ["a", "b"]
         self.assert_eq(actual, pdf)

         # For NumPy typing, NumPy version should be 1.21+
         if LooseVersion(np.__version__) >= LooseVersion("1.21"):
             import numpy.typing as ntp

             psdf = ps.from_pandas(pdf)

             def identify4(
                 x,
             ) -> ps.DataFrame[float, [int, ntp.NDArray[int]]]:
                 return x

             actual = psdf.pandas_on_spark.apply_batch(identify4)
             actual.columns = ["a", "b"]
             self.assert_eq(actual, pdf)

         arrays = [[1, 2, 3, 4, 5, 6, 7, 8, 9], ["a", "b", "c", "d", "e", "f", "g", "h", "i"]]
         idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
         pdf = pd.DataFrame(
             {"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [[e] for e in [4, 5, 6, 3, 2, 1, 0, 0, 0]]},
             index=idx,
         )
         psdf = ps.from_pandas(pdf)

         def identify4(x) -> ps.DataFrame[[int, str], [int, List[int]]]:
             return x

         actual = psdf.pandas_on_spark.apply_batch(identify4)
         actual.index.names = ["number", "color"]
         actual.columns = ["a", "b"]
         self.assert_eq(actual, pdf)

         def identify5(
             x,
         ) -> ps.DataFrame[
             [("number", int), ("color", str)], [("a", int), ("b", List[int])]  # noqa: F405
         ]:
             return x

         actual = psdf.pandas_on_spark.apply_batch(identify5)
         self.assert_eq(actual, pdf)

     def test_transform(self):
         pdf = pd.DataFrame(
             {
                 "a": [1, 2, 3, 4, 5, 6] * 100,
                 "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
                 "c": [1, 4, 9, 16, 25, 36] * 100,
             },
             columns=["a", "b", "c"],
             index=np.random.rand(600),
         )
         psdf = ps.DataFrame(pdf)
         self.assert_eq(
             psdf.transform(lambda x: x + 1).sort_index(),
             pdf.transform(lambda x: x + 1).sort_index(),
         )
         self.assert_eq(
             psdf.transform(lambda x, y: x + y, y=2).sort_index(),
             pdf.transform(lambda x, y: x + y, y=2).sort_index(),
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.transform(lambda x: x + 1).sort_index(),
                 pdf.transform(lambda x: x + 1).sort_index(),
             )
             self.assert_eq(
                 psdf.transform(lambda x, y: x + y, y=1).sort_index(),
                 pdf.transform(lambda x, y: x + y, y=1).sort_index(),
             )

         with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
             psdf.transform(1)
         with self.assertRaisesRegex(
             NotImplementedError, 'axis should be either 0 or "index" currently.'
         ):
             psdf.transform(lambda x: x + 1, axis=1)

         # multi-index columns
         columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
         pdf.columns = columns
         psdf.columns = columns

         self.assert_eq(
             psdf.transform(lambda x: x + 1).sort_index(),
             pdf.transform(lambda x: x + 1).sort_index(),
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.transform(lambda x: x + 1).sort_index(),
                 pdf.transform(lambda x: x + 1).sort_index(),
             )

     def test_transform_batch(self):
         pdf = pd.DataFrame(
             {
                 "a": [1, 2, 3, 4, 5, 6] * 100,
                 "b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
                 "c": [1, 4, 9, 16, 25, 36] * 100,
             },
             columns=["a", "b", "c"],
             index=np.random.rand(600),
         )
         psdf = ps.DataFrame(pdf)

         self.assert_eq(
             psdf.pandas_on_spark.transform_batch(lambda pdf: pdf.c + 1).sort_index(),
             (pdf.c + 1).sort_index(),
         )
         self.assert_eq(
             psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf + a, 1).sort_index(),
             (pdf + 1).sort_index(),
         )
         self.assert_eq(
             psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf.c + a, a=1).sort_index(),
             (pdf.c + 1).sort_index(),
         )

         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.pandas_on_spark.transform_batch(lambda pdf: pdf + 1).sort_index(),
                 (pdf + 1).sort_index(),
             )
             self.assert_eq(
                 psdf.pandas_on_spark.transform_batch(lambda pdf: pdf.b + 1).sort_index(),
                 (pdf.b + 1).sort_index(),
             )
             self.assert_eq(
                 psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf + a, 1).sort_index(),
                 (pdf + 1).sort_index(),
             )
             self.assert_eq(
                 psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf.c + a, a=1).sort_index(),
                 (pdf.c + 1).sort_index(),
             )

         with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
             psdf.pandas_on_spark.transform_batch(1)

         with self.assertRaisesRegex(ValueError, "The given function should return a frame"):
             psdf.pandas_on_spark.transform_batch(lambda pdf: 1)

         with self.assertRaisesRegex(
             ValueError, "transform_batch cannot produce aggregated results"
         ):
             psdf.pandas_on_spark.transform_batch(lambda pdf: pd.Series(1))

         # multi-index columns
         columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
         pdf.columns = columns
         psdf.columns = columns

         self.assert_eq(
             psdf.pandas_on_spark.transform_batch(lambda x: x + 1).sort_index(),
             (pdf + 1).sort_index(),
         )
         with option_context("compute.shortcut_limit", 500):
             self.assert_eq(
                 psdf.pandas_on_spark.transform_batch(lambda x: x + 1).sort_index(),
                 (pdf + 1).sort_index(),
             )

     def test_transform_batch_with_type(self):
         pdf = self.pdf
         psdf = ps.from_pandas(pdf)

         def identify1(x) -> ps.DataFrame[int, int]:
             return x

         # Type hints set the default column names, and we use default index for
         # pandas API on Spark. Here we ignore both diff.
         actual = psdf.pandas_on_spark.transform_batch(identify1)
         expected = pdf
         self.assert_eq(sorted(actual["c0"].to_numpy()), sorted(expected["a"].to_numpy()))
         self.assert_eq(sorted(actual["c1"].to_numpy()), sorted(expected["b"].to_numpy()))

         def identify2(x) -> ps.DataFrame[slice("a", int), slice("b", int)]:  # noqa: F405
             return x

         actual = psdf.pandas_on_spark.transform_batch(identify2)
         expected = pdf
         self.assert_eq(sorted(actual["a"].to_numpy()), sorted(expected["a"].to_numpy()))
         self.assert_eq(sorted(actual["b"].to_numpy()), sorted(expected["b"].to_numpy()))

     def test_transform_batch_same_anchor(self):
         psdf = ps.range(10)
         psdf["d"] = psdf.pandas_on_spark.transform_batch(lambda pdf: pdf.id + 1)
         self.assert_eq(
             psdf,
             pd.DataFrame({"id": list(range(10)), "d": list(range(1, 11))}, columns=["id", "d"]),
         )

         psdf = ps.range(10)

         def plus_one(pdf) -> ps.Series[np.int64]:
             return pdf.id + 1

         psdf["d"] = psdf.pandas_on_spark.transform_batch(plus_one)
         self.assert_eq(
             psdf,
             pd.DataFrame({"id": list(range(10)), "d": list(range(1, 11))}, columns=["id", "d"]),
         )

         psdf = ps.range(10)

         def plus_one(ser) -> ps.Series[np.int64]:
             return ser + 1

         psdf["d"] = psdf.id.pandas_on_spark.transform_batch(plus_one)
         self.assert_eq(
             psdf,
             pd.DataFrame({"id": list(range(10)), "d": list(range(1, 11))}, columns=["id", "d"]),
         )

     def test_pipe(self):
         psdf = ps.DataFrame(
             {"category": ["A", "A", "B"], "col1": [1, 2, 3], "col2": [4, 5, 6]},
             columns=["category", "col1", "col2"],
         )

         self.assertRaisesRegex(
             ValueError,
             "arg is both the pipe target and a keyword argument",
             lambda: psdf.pipe((lambda x: x, "arg"), arg="1"),
         )

     def test_aggregate(self):
         pdf = pd.DataFrame(
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]], columns=["A", "B", "C"]
         )
         psdf = ps.from_pandas(pdf)

         self.assert_eq(
             psdf.agg(["sum", "min"])[["A", "B", "C"]].sort_index(),  # TODO?: fix column order
             pdf.agg(["sum", "min"])[["A", "B", "C"]].sort_index(),
         )
         self.assert_eq(
             psdf.agg({"A": ["sum", "min"], "B": ["min", "max"]})[["A", "B"]].sort_index(),
             pdf.agg({"A": ["sum", "min"], "B": ["min", "max"]})[["A", "B"]].sort_index(),
         )

         self.assertRaises(KeyError, lambda: psdf.agg({"A": ["sum", "min"], "X": ["min", "max"]}))

         # multi-index columns
         columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")])
         pdf.columns = columns
         psdf.columns = columns

         self.assert_eq(
             psdf.agg(["sum", "min"])[[("X", "A"), ("X", "B"), ("Y", "C")]].sort_index(),
             pdf.agg(["sum", "min"])[[("X", "A"), ("X", "B"), ("Y", "C")]].sort_index(),
         )
         self.assert_eq(
             psdf.agg({("X", "A"): ["sum", "min"], ("X", "B"): ["min", "max"]})[
                 [("X", "A"), ("X", "B")]
             ].sort_index(),
             pdf.agg({("X", "A"): ["sum", "min"], ("X", "B"): ["min", "max"]})[
                 [("X", "A"), ("X", "B")]
             ].sort_index(),
         )

         self.assertRaises(TypeError, lambda: psdf.agg({"X": ["sum", "min"], "Y": ["min", "max"]}))

         # non-string names
         pdf = pd.DataFrame(
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]], columns=[10, 20, 30]
         )
         psdf = ps.from_pandas(pdf)

         self.assert_eq(
             psdf.agg(["sum", "min"])[[10, 20, 30]].sort_index(),
             pdf.agg(["sum", "min"])[[10, 20, 30]].sort_index(),
         )
         self.assert_eq(
             psdf.agg({10: ["sum", "min"], 20: ["min", "max"]})[[10, 20]].sort_index(),
             pdf.agg({10: ["sum", "min"], 20: ["min", "max"]})[[10, 20]].sort_index(),
         )

         columns = pd.MultiIndex.from_tuples([("X", 10), ("X", 20), ("Y", 30)])
         pdf.columns = columns
         psdf.columns = columns

         self.assert_eq(
             psdf.agg(["sum", "min"])[[("X", 10), ("X", 20), ("Y", 30)]].sort_index(),
             pdf.agg(["sum", "min"])[[("X", 10), ("X", 20), ("Y", 30)]].sort_index(),
         )
         self.assert_eq(
             psdf.agg({("X", 10): ["sum", "min"], ("X", 20): ["min", "max"]})[
                 [("X", 10), ("X", 20)]
             ].sort_index(),
             pdf.agg({("X", 10): ["sum", "min"], ("X", 20): ["min", "max"]})[
                 [("X", 10), ("X", 20)]
             ].sort_index(),
         )

         pdf = pd.DataFrame(
             [datetime(2019, 2, 2, 0, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0, 0)],
             columns=["timestamp"],
         )
         psdf = ps.from_pandas(pdf)

         self.assert_eq(psdf.timestamp.min(), pdf.timestamp.min())
         self.assert_eq(psdf.timestamp.max(), pdf.timestamp.max())

         self.assertRaises(ValueError, lambda: psdf.agg(("sum", "min")))


 class FrameApplyFunctionTests(
     FrameApplyFunctionMixin,
     PandasOnSparkTestCase,
     SQLTestUtils,
 ):
     pass


 if __name__ == "__main__":
     from pyspark.pandas.tests.computation.test_apply_func import *  # noqa: F401

     try:
         import xmlrunner

         testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
     except ImportError:
         testRunner = None
     unittest.main(testRunner=testRunner, verbosity=2)
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	from datetime import datetime
	import sys
	import unittest
	from typing import List

	import numpy as np
	import pandas as pd

	from pyspark import pandas as ps
	from pyspark.loose_version import LooseVersion
	from pyspark.pandas.config import option_context
	from pyspark.testing.pandasutils import PandasOnSparkTestCase
	from pyspark.testing.sqlutils import SQLTestUtils


	# This file contains test cases for 'Function application, GroupBy & Window'
	# https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/frame.html#function-application-groupby-window
	# as well as 'apply_batch' and 'transform_batch'.
	class FrameApplyFunctionMixin:
	@property
	def pdf(self):
	return pd.DataFrame(
	{"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [4, 5, 6, 3, 2, 1, 0, 0, 0]},
	index=np.random.rand(9),
	)

	@property
	def psdf(self):
	return ps.from_pandas(self.pdf)

	def test_apply(self):
	pdf = pd.DataFrame(
	{
	"a": [1, 2, 3, 4, 5, 6] * 100,
	"b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
	"c": [1, 4, 9, 16, 25, 36] * 100,
	},
	columns=["a", "b", "c"],
	index=np.random.rand(600),
	)
	psdf = ps.DataFrame(pdf)

	self.assert_eq(
	psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
	)
	self.assert_eq(
	psdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
	pdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
	)
	self.assert_eq(
	psdf.apply(lambda x, b: x + b, b=1).sort_index(),
	pdf.apply(lambda x, b: x + b, b=1).sort_index(),
	)

	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
	)
	self.assert_eq(
	psdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
	pdf.apply(lambda x, b: x + b, args=(1,)).sort_index(),
	)
	self.assert_eq(
	psdf.apply(lambda x, b: x + b, b=1).sort_index(),
	pdf.apply(lambda x, b: x + b, b=1).sort_index(),
	)

	# returning a Series
	self.assert_eq(
	psdf.apply(lambda x: len(x), axis=1).sort_index(),
	pdf.apply(lambda x: len(x), axis=1).sort_index(),
	)
	self.assert_eq(
	psdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
	pdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.apply(lambda x: len(x), axis=1).sort_index(),
	pdf.apply(lambda x: len(x), axis=1).sort_index(),
	)
	self.assert_eq(
	psdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
	pdf.apply(lambda x, c: len(x) + c, axis=1, c=100).sort_index(),
	)

	with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
	psdf.apply(1)

	with self.assertRaisesRegex(TypeError, "The given function.*1 or 'column'; however"):

	def f1(_) -> ps.DataFrame[int]:
	pass

	psdf.apply(f1, axis=0)

	with self.assertRaisesRegex(TypeError, "The given function.*0 or 'index'; however"):

	def f2(_) -> ps.Series[int]:
	pass

	psdf.apply(f2, axis=1)

	# multi-index columns
	columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
	pdf.columns = columns
	psdf.columns = columns

	self.assert_eq(
	psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.apply(lambda x: x + 1).sort_index(), pdf.apply(lambda x: x + 1).sort_index()
	)

	# returning a Series
	self.assert_eq(
	psdf.apply(lambda x: len(x), axis=1).sort_index(),
	pdf.apply(lambda x: len(x), axis=1).sort_index(),
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.apply(lambda x: len(x), axis=1).sort_index(),
	pdf.apply(lambda x: len(x), axis=1).sort_index(),
	)

	def test_apply_with_type(self):
	pdf = self.pdf
	psdf = ps.from_pandas(pdf)

	def identify1(x) -> ps.DataFrame[int, int]:
	return x

	# Type hints set the default column names, and we use default index for
	# pandas API on Spark. Here we ignore both diff.
	actual = psdf.apply(identify1, axis=1)
	expected = pdf.apply(identify1, axis=1)
	self.assert_eq(sorted(actual["c0"].to_numpy()), sorted(expected["a"].to_numpy()))
	self.assert_eq(sorted(actual["c1"].to_numpy()), sorted(expected["b"].to_numpy()))

	def identify2(x) -> ps.DataFrame[slice("a", int), slice("b", int)]: # noqa: F405
	return x

	actual = psdf.apply(identify2, axis=1)
	expected = pdf.apply(identify2, axis=1)
	self.assert_eq(sorted(actual["a"].to_numpy()), sorted(expected["a"].to_numpy()))
	self.assert_eq(sorted(actual["b"].to_numpy()), sorted(expected["b"].to_numpy()))

	def test_apply_batch(self):
	pdf = pd.DataFrame(
	{
	"a": [1, 2, 3, 4, 5, 6] * 100,
	"b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
	"c": [1, 4, 9, 16, 25, 36] * 100,
	},
	columns=["a", "b", "c"],
	index=np.random.rand(600),
	)
	psdf = ps.DataFrame(pdf)

	self.assert_eq(
	psdf.pandas_on_spark.apply_batch(lambda pdf, a: pdf + a, args=(1,)).sort_index(),
	(pdf + 1).sort_index(),
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.pandas_on_spark.apply_batch(lambda pdf: pdf + 1).sort_index(),
	(pdf + 1).sort_index(),
	)
	self.assert_eq(
	psdf.pandas_on_spark.apply_batch(lambda pdf, b: pdf + b, b=1).sort_index(),
	(pdf + 1).sort_index(),
	)

	with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
	psdf.pandas_on_spark.apply_batch(1)

	with self.assertRaisesRegex(TypeError, "The given function.*frame as its type hints"):

	def f2(_) -> ps.Series[int]:
	pass

	psdf.pandas_on_spark.apply_batch(f2)

	with self.assertRaisesRegex(ValueError, "The given function should return a frame"):
	psdf.pandas_on_spark.apply_batch(lambda pdf: 1)

	# multi-index columns
	columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
	pdf.columns = columns
	psdf.columns = columns

	self.assert_eq(
	psdf.pandas_on_spark.apply_batch(lambda x: x + 1).sort_index(), (pdf + 1).sort_index()
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.pandas_on_spark.apply_batch(lambda x: x + 1).sort_index(),
	(pdf + 1).sort_index(),
	)

	def test_apply_batch_with_type(self):
	pdf = self.pdf
	psdf = ps.from_pandas(pdf)

	def identify1(x) -> ps.DataFrame[int, int]:
	return x

	# Type hints set the default column names, and we use default index for
	# pandas API on Spark. Here we ignore both diff.
	actual = psdf.pandas_on_spark.apply_batch(identify1)
	expected = pdf
	self.assert_eq(sorted(actual["c0"].to_numpy()), sorted(expected["a"].to_numpy()))
	self.assert_eq(sorted(actual["c1"].to_numpy()), sorted(expected["b"].to_numpy()))

	def identify2(x) -> ps.DataFrame[slice("a", int), slice("b", int)]: # noqa: F405
	return x

	actual = psdf.pandas_on_spark.apply_batch(identify2)
	expected = pdf
	self.assert_eq(sorted(actual["a"].to_numpy()), sorted(expected["a"].to_numpy()))
	self.assert_eq(sorted(actual["b"].to_numpy()), sorted(expected["b"].to_numpy()))

	pdf = pd.DataFrame(
	{"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [[e] for e in [4, 5, 6, 3, 2, 1, 0, 0, 0]]},
	index=np.random.rand(9),
	)
	psdf = ps.from_pandas(pdf)

	def identify3(x) -> ps.DataFrame[float, [int, List[int]]]:
	return x

	actual = psdf.pandas_on_spark.apply_batch(identify3)
	actual.columns = ["a", "b"]
	self.assert_eq(actual, pdf)

	# For NumPy typing, NumPy version should be 1.21+
	if LooseVersion(np.__version__) >= LooseVersion("1.21"):
	import numpy.typing as ntp

	psdf = ps.from_pandas(pdf)

	def identify4(
	x,
	) -> ps.DataFrame[float, [int, ntp.NDArray[int]]]:
	return x

	actual = psdf.pandas_on_spark.apply_batch(identify4)
	actual.columns = ["a", "b"]
	self.assert_eq(actual, pdf)

	arrays = [[1, 2, 3, 4, 5, 6, 7, 8, 9], ["a", "b", "c", "d", "e", "f", "g", "h", "i"]]
	idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
	pdf = pd.DataFrame(
	{"a": [1, 2, 3, 4, 5, 6, 7, 8, 9], "b": [[e] for e in [4, 5, 6, 3, 2, 1, 0, 0, 0]]},
	index=idx,
	)
	psdf = ps.from_pandas(pdf)

	def identify4(x) -> ps.DataFrame[[int, str], [int, List[int]]]:
	return x

	actual = psdf.pandas_on_spark.apply_batch(identify4)
	actual.index.names = ["number", "color"]
	actual.columns = ["a", "b"]
	self.assert_eq(actual, pdf)

	def identify5(
	x,
	) -> ps.DataFrame[
	[("number", int), ("color", str)], [("a", int), ("b", List[int])] # noqa: F405
	]:
	return x

	actual = psdf.pandas_on_spark.apply_batch(identify5)
	self.assert_eq(actual, pdf)

	def test_transform(self):
	pdf = pd.DataFrame(
	{
	"a": [1, 2, 3, 4, 5, 6] * 100,
	"b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
	"c": [1, 4, 9, 16, 25, 36] * 100,
	},
	columns=["a", "b", "c"],
	index=np.random.rand(600),
	)
	psdf = ps.DataFrame(pdf)
	self.assert_eq(
	psdf.transform(lambda x: x + 1).sort_index(),
	pdf.transform(lambda x: x + 1).sort_index(),
	)
	self.assert_eq(
	psdf.transform(lambda x, y: x + y, y=2).sort_index(),
	pdf.transform(lambda x, y: x + y, y=2).sort_index(),
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.transform(lambda x: x + 1).sort_index(),
	pdf.transform(lambda x: x + 1).sort_index(),
	)
	self.assert_eq(
	psdf.transform(lambda x, y: x + y, y=1).sort_index(),
	pdf.transform(lambda x, y: x + y, y=1).sort_index(),
	)

	with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
	psdf.transform(1)
	with self.assertRaisesRegex(
	NotImplementedError, 'axis should be either 0 or "index" currently.'
	):
	psdf.transform(lambda x: x + 1, axis=1)

	# multi-index columns
	columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
	pdf.columns = columns
	psdf.columns = columns

	self.assert_eq(
	psdf.transform(lambda x: x + 1).sort_index(),
	pdf.transform(lambda x: x + 1).sort_index(),
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.transform(lambda x: x + 1).sort_index(),
	pdf.transform(lambda x: x + 1).sort_index(),
	)

	def test_transform_batch(self):
	pdf = pd.DataFrame(
	{
	"a": [1, 2, 3, 4, 5, 6] * 100,
	"b": [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] * 100,
	"c": [1, 4, 9, 16, 25, 36] * 100,
	},
	columns=["a", "b", "c"],
	index=np.random.rand(600),
	)
	psdf = ps.DataFrame(pdf)

	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda pdf: pdf.c + 1).sort_index(),
	(pdf.c + 1).sort_index(),
	)
	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf + a, 1).sort_index(),
	(pdf + 1).sort_index(),
	)
	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf.c + a, a=1).sort_index(),
	(pdf.c + 1).sort_index(),
	)

	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda pdf: pdf + 1).sort_index(),
	(pdf + 1).sort_index(),
	)
	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda pdf: pdf.b + 1).sort_index(),
	(pdf.b + 1).sort_index(),
	)
	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf + a, 1).sort_index(),
	(pdf + 1).sort_index(),
	)
	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda pdf, a: pdf.c + a, a=1).sort_index(),
	(pdf.c + 1).sort_index(),
	)

	with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
	psdf.pandas_on_spark.transform_batch(1)

	with self.assertRaisesRegex(ValueError, "The given function should return a frame"):
	psdf.pandas_on_spark.transform_batch(lambda pdf: 1)

	with self.assertRaisesRegex(
	ValueError, "transform_batch cannot produce aggregated results"
	):
	psdf.pandas_on_spark.transform_batch(lambda pdf: pd.Series(1))

	# multi-index columns
	columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "c")])
	pdf.columns = columns
	psdf.columns = columns

	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda x: x + 1).sort_index(),
	(pdf + 1).sort_index(),
	)
	with option_context("compute.shortcut_limit", 500):
	self.assert_eq(
	psdf.pandas_on_spark.transform_batch(lambda x: x + 1).sort_index(),
	(pdf + 1).sort_index(),
	)

	def test_transform_batch_with_type(self):
	pdf = self.pdf
	psdf = ps.from_pandas(pdf)

	def identify1(x) -> ps.DataFrame[int, int]:
	return x

	# Type hints set the default column names, and we use default index for
	# pandas API on Spark. Here we ignore both diff.
	actual = psdf.pandas_on_spark.transform_batch(identify1)
	expected = pdf
	self.assert_eq(sorted(actual["c0"].to_numpy()), sorted(expected["a"].to_numpy()))
	self.assert_eq(sorted(actual["c1"].to_numpy()), sorted(expected["b"].to_numpy()))

	def identify2(x) -> ps.DataFrame[slice("a", int), slice("b", int)]: # noqa: F405
	return x

	actual = psdf.pandas_on_spark.transform_batch(identify2)
	expected = pdf
	self.assert_eq(sorted(actual["a"].to_numpy()), sorted(expected["a"].to_numpy()))
	self.assert_eq(sorted(actual["b"].to_numpy()), sorted(expected["b"].to_numpy()))

	def test_transform_batch_same_anchor(self):
	psdf = ps.range(10)
	psdf["d"] = psdf.pandas_on_spark.transform_batch(lambda pdf: pdf.id + 1)
	self.assert_eq(
	psdf,
	pd.DataFrame({"id": list(range(10)), "d": list(range(1, 11))}, columns=["id", "d"]),
	)

	psdf = ps.range(10)

	def plus_one(pdf) -> ps.Series[np.int64]:
	return pdf.id + 1

	psdf["d"] = psdf.pandas_on_spark.transform_batch(plus_one)
	self.assert_eq(
	psdf,
	pd.DataFrame({"id": list(range(10)), "d": list(range(1, 11))}, columns=["id", "d"]),
	)

	psdf = ps.range(10)

	def plus_one(ser) -> ps.Series[np.int64]:
	return ser + 1

	psdf["d"] = psdf.id.pandas_on_spark.transform_batch(plus_one)
	self.assert_eq(
	psdf,
	pd.DataFrame({"id": list(range(10)), "d": list(range(1, 11))}, columns=["id", "d"]),
	)

	def test_pipe(self):
	psdf = ps.DataFrame(
	{"category": ["A", "A", "B"], "col1": [1, 2, 3], "col2": [4, 5, 6]},
	columns=["category", "col1", "col2"],
	)

	self.assertRaisesRegex(
	ValueError,
	"arg is both the pipe target and a keyword argument",
	lambda: psdf.pipe((lambda x: x, "arg"), arg="1"),
	)

	def test_aggregate(self):
	pdf = pd.DataFrame(
	[[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]], columns=["A", "B", "C"]
	)
	psdf = ps.from_pandas(pdf)

	self.assert_eq(
	psdf.agg(["sum", "min"])[["A", "B", "C"]].sort_index(), # TODO?: fix column order
	pdf.agg(["sum", "min"])[["A", "B", "C"]].sort_index(),
	)
	self.assert_eq(
	psdf.agg({"A": ["sum", "min"], "B": ["min", "max"]})[["A", "B"]].sort_index(),
	pdf.agg({"A": ["sum", "min"], "B": ["min", "max"]})[["A", "B"]].sort_index(),
	)

	self.assertRaises(KeyError, lambda: psdf.agg({"A": ["sum", "min"], "X": ["min", "max"]}))

	# multi-index columns
	columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")])
	pdf.columns = columns
	psdf.columns = columns

	self.assert_eq(
	psdf.agg(["sum", "min"])[[("X", "A"), ("X", "B"), ("Y", "C")]].sort_index(),
	pdf.agg(["sum", "min"])[[("X", "A"), ("X", "B"), ("Y", "C")]].sort_index(),
	)
	self.assert_eq(
	psdf.agg({("X", "A"): ["sum", "min"], ("X", "B"): ["min", "max"]})[
	[("X", "A"), ("X", "B")]
	].sort_index(),
	pdf.agg({("X", "A"): ["sum", "min"], ("X", "B"): ["min", "max"]})[
	[("X", "A"), ("X", "B")]
	].sort_index(),
	)

	self.assertRaises(TypeError, lambda: psdf.agg({"X": ["sum", "min"], "Y": ["min", "max"]}))

	# non-string names
	pdf = pd.DataFrame(
	[[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]], columns=[10, 20, 30]
	)
	psdf = ps.from_pandas(pdf)

	self.assert_eq(
	psdf.agg(["sum", "min"])[[10, 20, 30]].sort_index(),
	pdf.agg(["sum", "min"])[[10, 20, 30]].sort_index(),
	)
	self.assert_eq(
	psdf.agg({10: ["sum", "min"], 20: ["min", "max"]})[[10, 20]].sort_index(),
	pdf.agg({10: ["sum", "min"], 20: ["min", "max"]})[[10, 20]].sort_index(),
	)

	columns = pd.MultiIndex.from_tuples([("X", 10), ("X", 20), ("Y", 30)])
	pdf.columns = columns
	psdf.columns = columns

	self.assert_eq(
	psdf.agg(["sum", "min"])[[("X", 10), ("X", 20), ("Y", 30)]].sort_index(),
	pdf.agg(["sum", "min"])[[("X", 10), ("X", 20), ("Y", 30)]].sort_index(),
	)
	self.assert_eq(
	psdf.agg({("X", 10): ["sum", "min"], ("X", 20): ["min", "max"]})[
	[("X", 10), ("X", 20)]
	].sort_index(),
	pdf.agg({("X", 10): ["sum", "min"], ("X", 20): ["min", "max"]})[
	[("X", 10), ("X", 20)]
	].sort_index(),
	)

	pdf = pd.DataFrame(
	[datetime(2019, 2, 2, 0, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0, 0)],
	columns=["timestamp"],
	)
	psdf = ps.from_pandas(pdf)

	self.assert_eq(psdf.timestamp.min(), pdf.timestamp.min())
	self.assert_eq(psdf.timestamp.max(), pdf.timestamp.max())

	self.assertRaises(ValueError, lambda: psdf.agg(("sum", "min")))


	class FrameApplyFunctionTests(
	FrameApplyFunctionMixin,
	PandasOnSparkTestCase,
	SQLTestUtils,
	):
	pass


	if __name__ == "__main__":
	from pyspark.pandas.tests.computation.test_apply_func import * # noqa: F401

	try:
	import xmlrunner

	testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
	except ImportError:
	testRunner = None
	unittest.main(testRunner=testRunner, verbosity=2)