# python/pyspark/pandas/tests/frame/test_axis.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import unittest

 import pandas as pd

 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils


 class FrameAxisMixin:
     def test_axis_on_dataframe(self):
         """Compare axis-wise reductions of a pandas-on-Spark DataFrame with pandas.

         Every reduction is evaluated twice, first with its plain arguments and
         then again with ``numeric_only=True``, and each result is compared with
         the pandas result through ``self.assert_eq``.
         """
         # Each entry is (method name, method kwargs, extra ``assert_eq`` options),
         # listed in the order in which the comparisons are executed.
         approx = {"almost": True}
         reductions = [
             ("count", {"axis": 1}, {}),
             ("var", {"axis": 1}, {}),
             ("var", {"axis": 1, "ddof": 0}, {}),
             ("std", {"axis": 1}, {}),
             ("std", {"axis": 1, "ddof": 0}, {}),
             ("max", {"axis": 1}, {}),
             ("min", {"axis": 1}, {}),
             ("sum", {"axis": 1}, {}),
             ("product", {"axis": 1}, {}),
             ("kurtosis", {"axis": 0}, approx),
             ("kurtosis", {"axis": 1}, {}),
             ("skew", {"axis": 0}, approx),
             ("skew", {"axis": 1}, {}),
             ("mean", {"axis": 1}, {}),
             ("sem", {"axis": 1}, {}),
             ("sem", {"axis": 1, "ddof": 0}, {}),
         ]
         # With ``numeric_only=True`` the pandas result of these reductions is
         # cast to float before it is compared.
         float_cast_expected = {"max", "min", "sum", "product"}

         # The number of each count is intentionally big
         # because when data is small, it executes a shortcut.
         # Less than 'compute.shortcut_limit' will execute a shortcut
         # by using collected pandas dataframe directly.
         # now we set the 'compute.shortcut_limit' as 1000 explicitly
         with ps.option_context("compute.shortcut_limit", 1000):
             columns = {
                 "A": [1, -2, 3, -4, 5] * 300,
                 "B": [1.0, -2, 3, -4, 5] * 300,
                 "C": [-6.0, -7, -8, -9, 10] * 300,
                 "D": [True, False, True, False, False] * 300,
             }
             pdf = pd.DataFrame(columns, index=range(10, 15001, 10))
             # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas regression is fixed
             # There is a regression in Pandas 2.1.0,
             # so we should manually cast to float until the regression is fixed.
             # See https://github.com/pandas-dev/pandas/issues/55194.
             pdf = pdf.astype(float)
             psdf = ps.from_pandas(pdf)

             # First pass: plain arguments.
             for method, kwargs, options in reductions:
                 actual = getattr(psdf, method)(**kwargs)
                 expected = getattr(pdf, method)(**kwargs)
                 self.assert_eq(actual, expected, **options)

             # Second pass: the same reductions restricted to numeric columns.
             for method, kwargs, options in reductions:
                 numeric_kwargs = dict(kwargs, numeric_only=True)
                 actual = getattr(psdf, method)(**numeric_kwargs)
                 expected = getattr(pdf, method)(**numeric_kwargs)
                 if method in float_cast_expected:
                     expected = expected.astype(float)
                 self.assert_eq(actual, expected, **options)


 class FrameAxisTests(FrameAxisMixin, PandasOnSparkTestCase, SQLTestUtils):
     """Concrete test case that runs the ``FrameAxisMixin`` tests.

     Everything is inherited from the listed base classes; nothing is added here.
     """


 if __name__ == "__main__":
     from pyspark.pandas.tests.frame.test_axis import *  # noqa: F401

     # Start without a custom runner; it is replaced below only if an XML
     # runner can be imported and constructed.
     testRunner = None
     try:
         import xmlrunner

         testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
     except ImportError:
         # xmlrunner is optional: keep ``testRunner`` as None.
         pass
     unittest.main(testRunner=testRunner, verbosity=2)
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import unittest

	import pandas as pd

	from pyspark import pandas as ps
	from pyspark.testing.pandasutils import PandasOnSparkTestCase
	from pyspark.testing.sqlutils import SQLTestUtils


	class FrameAxisMixin:
	    def test_axis_on_dataframe(self):
	        """Check that axis-wise reductions on a pandas-on-Spark DataFrame match pandas.

	        Each reduction is compared once with its plain arguments and once
	        with ``numeric_only=True``.
	        """
	        # The number of each count is intentionally big
	        # because when data is small, it executes a shortcut.
	        # Less than 'compute.shortcut_limit' will execute a shortcut
	        # by using collected pandas dataframe directly.
	        # now we set the 'compute.shortcut_limit' as 1000 explicitly
	        with ps.option_context("compute.shortcut_limit", 1000):
	            pdf = pd.DataFrame(
	                {
	                    "A": [1, -2, 3, -4, 5] * 300,
	                    "B": [1.0, -2, 3, -4, 5] * 300,
	                    "C": [-6.0, -7, -8, -9, 10] * 300,
	                    "D": [True, False, True, False, False] * 300,
	                },
	                index=range(10, 15001, 10),
	            )
	            # TODO(SPARK-45228): Update `test_axis_on_dataframe` when Pandas regression is fixed
	            # There is a regression in Pandas 2.1.0,
	            # so we should manually cast to float until the regression is fixed.
	            # See https://github.com/pandas-dev/pandas/issues/55194.
	            pdf = pdf.astype(float)
	            psdf = ps.from_pandas(pdf)
	            self.assert_eq(psdf.count(axis=1), pdf.count(axis=1))
	            self.assert_eq(psdf.var(axis=1), pdf.var(axis=1))
	            self.assert_eq(psdf.var(axis=1, ddof=0), pdf.var(axis=1, ddof=0))
	            self.assert_eq(psdf.std(axis=1), pdf.std(axis=1))
	            self.assert_eq(psdf.std(axis=1, ddof=0), pdf.std(axis=1, ddof=0))
	            self.assert_eq(psdf.max(axis=1), pdf.max(axis=1))
	            self.assert_eq(psdf.min(axis=1), pdf.min(axis=1))
	            self.assert_eq(psdf.sum(axis=1), pdf.sum(axis=1))
	            self.assert_eq(psdf.product(axis=1), pdf.product(axis=1))
	            self.assert_eq(psdf.kurtosis(axis=0), pdf.kurtosis(axis=0), almost=True)
	            self.assert_eq(psdf.kurtosis(axis=1), pdf.kurtosis(axis=1))
	            self.assert_eq(psdf.skew(axis=0), pdf.skew(axis=0), almost=True)
	            self.assert_eq(psdf.skew(axis=1), pdf.skew(axis=1))
	            self.assert_eq(psdf.mean(axis=1), pdf.mean(axis=1))
	            self.assert_eq(psdf.sem(axis=1), pdf.sem(axis=1))
	            self.assert_eq(psdf.sem(axis=1, ddof=0), pdf.sem(axis=1, ddof=0))

	            # Same reductions again, restricted to numeric columns.
	            self.assert_eq(
	                psdf.count(axis=1, numeric_only=True), pdf.count(axis=1, numeric_only=True)
	            )
	            self.assert_eq(psdf.var(axis=1, numeric_only=True), pdf.var(axis=1, numeric_only=True))
	            self.assert_eq(
	                psdf.var(axis=1, ddof=0, numeric_only=True),
	                pdf.var(axis=1, ddof=0, numeric_only=True),
	            )
	            self.assert_eq(psdf.std(axis=1, numeric_only=True), pdf.std(axis=1, numeric_only=True))
	            self.assert_eq(
	                psdf.std(axis=1, ddof=0, numeric_only=True),
	                pdf.std(axis=1, ddof=0, numeric_only=True),
	            )
	            self.assert_eq(
	                psdf.max(axis=1, numeric_only=True),
	                pdf.max(axis=1, numeric_only=True).astype(float),
	            )
	            self.assert_eq(
	                psdf.min(axis=1, numeric_only=True),
	                pdf.min(axis=1, numeric_only=True).astype(float),
	            )
	            self.assert_eq(
	                psdf.sum(axis=1, numeric_only=True),
	                pdf.sum(axis=1, numeric_only=True).astype(float),
	            )
	            self.assert_eq(
	                psdf.product(axis=1, numeric_only=True),
	                pdf.product(axis=1, numeric_only=True).astype(float),
	            )
	            self.assert_eq(
	                psdf.kurtosis(axis=0, numeric_only=True),
	                pdf.kurtosis(axis=0, numeric_only=True),
	                almost=True,
	            )
	            self.assert_eq(
	                psdf.kurtosis(axis=1, numeric_only=True), pdf.kurtosis(axis=1, numeric_only=True)
	            )
	            self.assert_eq(
	                psdf.skew(axis=0, numeric_only=True),
	                pdf.skew(axis=0, numeric_only=True),
	                almost=True,
	            )
	            self.assert_eq(
	                psdf.skew(axis=1, numeric_only=True), pdf.skew(axis=1, numeric_only=True)
	            )
	            self.assert_eq(
	                psdf.mean(axis=1, numeric_only=True), pdf.mean(axis=1, numeric_only=True)
	            )
	            self.assert_eq(psdf.sem(axis=1, numeric_only=True), pdf.sem(axis=1, numeric_only=True))
	            self.assert_eq(
	                psdf.sem(axis=1, ddof=0, numeric_only=True),
	                pdf.sem(axis=1, ddof=0, numeric_only=True),
	            )


	class FrameAxisTests(
	    FrameAxisMixin,
	    PandasOnSparkTestCase,
	    SQLTestUtils,
	):
	    """Concrete test case that runs the ``FrameAxisMixin`` tests; adds nothing of its own."""

	    pass


	if __name__ == "__main__":
	    from pyspark.pandas.tests.frame.test_axis import *  # noqa: F401

	    try:
	        import xmlrunner

	        # Use an XML-producing runner that writes under target/test-reports.
	        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
	    except ImportError:
	        # xmlrunner is optional; passing None lets unittest pick its default runner.
	        testRunner = None
	    unittest.main(testRunner=testRunner, verbosity=2)