blob: 077a8e31d20ad58adc6d7ad61dcff3a1b1f39662 [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest
from decimal import Decimal
import numpy as np
import pandas as pd
from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
class SeriesStatMixin:
def test_nunique(self):
pser = pd.Series([1, 2, 1, np.nan])
psser = ps.from_pandas(pser)
# Assert NaNs are dropped by default
nunique_result = psser.nunique()
self.assertEqual(nunique_result, 2)
self.assert_eq(nunique_result, pser.nunique())
# Assert including NaN values
nunique_result = psser.nunique(dropna=False)
self.assertEqual(nunique_result, 3)
self.assert_eq(nunique_result, pser.nunique(dropna=False))
# Assert approximate counts
self.assertEqual(ps.Series(range(100)).nunique(approx=True), 103)
self.assertEqual(ps.Series(range(100)).nunique(approx=True, rsd=0.01), 100)
def test_value_counts(self):
# this is also containing test for Index & MultiIndex
pser = pd.Series(
[1, 2, 1, 3, 3, np.nan, 1, 4, 2, np.nan, 3, np.nan, 3, 1, 3],
index=[1, 2, 1, 3, 3, np.nan, 1, 4, 2, np.nan, 3, np.nan, 3, 1, 3],
name="x",
)
psser = ps.from_pandas(pser)
exp = pser.value_counts()
res = psser.value_counts()
self.assertEqual(res.name, exp.name)
self.assert_eq(res, exp)
self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True))
self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True))
self.assert_eq(
psser.value_counts(normalize=True, dropna=False),
pser.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.value_counts(ascending=True, dropna=False),
pser.value_counts(ascending=True, dropna=False),
)
self.assert_eq(
psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True)
)
self.assert_eq(
psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True)
)
self.assert_eq(
psser.index.value_counts(normalize=True, dropna=False),
pser.index.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.index.value_counts(ascending=True, dropna=False),
pser.index.value_counts(ascending=True, dropna=False),
)
with self.assertRaisesRegex(
NotImplementedError, "value_counts currently does not support bins"
):
psser.value_counts(bins=3)
pser.name = "index"
psser.name = "index"
self.assert_eq(psser.value_counts(), pser.value_counts())
# Series from DataFrame
pdf = pd.DataFrame({"a": [2, 2, 3], "b": [None, 1, None]})
psdf = ps.from_pandas(pdf)
self.assert_eq(psdf.a.value_counts(normalize=True), pdf.a.value_counts(normalize=True))
self.assert_eq(psdf.a.value_counts(ascending=True), pdf.a.value_counts(ascending=True))
self.assert_eq(
psdf.a.value_counts(normalize=True, dropna=False),
pdf.a.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psdf.a.value_counts(ascending=True, dropna=False),
pdf.a.value_counts(ascending=True, dropna=False),
)
self.assert_eq(
psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True)
)
self.assert_eq(
psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True)
)
self.assert_eq(
psser.index.value_counts(normalize=True, dropna=False),
pser.index.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.index.value_counts(ascending=True, dropna=False),
pser.index.value_counts(ascending=True, dropna=False),
)
# Series with NaN index
pser = pd.Series([3, 2, 3, 1, 2, 3], index=[2.0, None, 5.0, 5.0, None, 5.0])
psser = ps.from_pandas(pser)
self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True))
self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True))
self.assert_eq(
psser.value_counts(normalize=True, dropna=False),
pser.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.value_counts(ascending=True, dropna=False),
pser.value_counts(ascending=True, dropna=False),
)
self.assert_eq(
psser.index.value_counts(normalize=True), pser.index.value_counts(normalize=True)
)
self.assert_eq(
psser.index.value_counts(ascending=True), pser.index.value_counts(ascending=True)
)
self.assert_eq(
psser.index.value_counts(normalize=True, dropna=False),
pser.index.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.index.value_counts(ascending=True, dropna=False),
pser.index.value_counts(ascending=True, dropna=False),
)
# Series with MultiIndex
pser.index = pd.MultiIndex.from_tuples(
[("x", "a"), ("x", "b"), ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")]
)
psser = ps.from_pandas(pser)
self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True))
self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True))
self.assert_eq(
psser.value_counts(normalize=True, dropna=False),
pser.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.value_counts(ascending=True, dropna=False),
pser.value_counts(ascending=True, dropna=False),
)
# FIXME: MultiIndex.value_counts returns wrong indices.
self.assert_eq(
psser.index.value_counts(normalize=True),
pser.index.value_counts(normalize=True),
almost=True,
)
self.assert_eq(
psser.index.value_counts(ascending=True),
pser.index.value_counts(ascending=True),
almost=True,
)
self.assert_eq(
psser.index.value_counts(normalize=True, dropna=False),
pser.index.value_counts(normalize=True, dropna=False),
almost=True,
)
self.assert_eq(
psser.index.value_counts(ascending=True, dropna=False),
pser.index.value_counts(ascending=True, dropna=False),
almost=True,
)
# Series with MultiIndex some of index has NaN
pser.index = pd.MultiIndex.from_tuples(
[("x", "a"), ("x", None), ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")]
)
psser = ps.from_pandas(pser)
self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True))
self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True))
self.assert_eq(
psser.value_counts(normalize=True, dropna=False),
pser.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.value_counts(ascending=True, dropna=False),
pser.value_counts(ascending=True, dropna=False),
)
# FIXME: MultiIndex.value_counts returns wrong indices.
self.assert_eq(
psser.index.value_counts(normalize=True),
pser.index.value_counts(normalize=True),
almost=True,
)
self.assert_eq(
psser.index.value_counts(ascending=True),
pser.index.value_counts(ascending=True),
almost=True,
)
self.assert_eq(
psser.index.value_counts(normalize=True, dropna=False),
pser.index.value_counts(normalize=True, dropna=False),
almost=True,
)
self.assert_eq(
psser.index.value_counts(ascending=True, dropna=False),
pser.index.value_counts(ascending=True, dropna=False),
almost=True,
)
# Series with MultiIndex some of index is NaN.
pser.index = pd.MultiIndex.from_tuples(
[("x", "a"), None, ("y", "c"), ("x", "a"), ("y", "c"), ("x", "a")]
)
psser = ps.from_pandas(pser)
self.assert_eq(psser.value_counts(normalize=True), pser.value_counts(normalize=True))
self.assert_eq(psser.value_counts(ascending=True), pser.value_counts(ascending=True))
self.assert_eq(
psser.value_counts(normalize=True, dropna=False),
pser.value_counts(normalize=True, dropna=False),
)
self.assert_eq(
psser.value_counts(ascending=True, dropna=False),
pser.value_counts(ascending=True, dropna=False),
)
# FIXME: MultiIndex.value_counts returns wrong indices.
self.assert_eq(
psser.index.value_counts(normalize=True),
pser.index.value_counts(normalize=True),
almost=True,
)
self.assert_eq(
psser.index.value_counts(ascending=True),
pser.index.value_counts(ascending=True),
almost=True,
)
self.assert_eq(
psser.index.value_counts(normalize=True, dropna=False),
pser.index.value_counts(normalize=True, dropna=False),
almost=True,
)
self.assert_eq(
psser.index.value_counts(ascending=True, dropna=False),
pser.index.value_counts(ascending=True, dropna=False),
almost=True,
)
def test_nsmallest(self):
sample_lst = [1, 2, 3, 4, np.nan, 6]
pser = pd.Series(sample_lst, name="x")
psser = ps.Series(sample_lst, name="x")
self.assert_eq(psser.nsmallest(n=3), pser.nsmallest(n=3))
self.assert_eq(psser.nsmallest(), pser.nsmallest())
self.assert_eq((psser + 1).nsmallest(), (pser + 1).nsmallest())
def test_nlargest(self):
sample_lst = [1, 2, 3, 4, np.nan, 6]
pser = pd.Series(sample_lst, name="x")
psser = ps.Series(sample_lst, name="x")
self.assert_eq(psser.nlargest(n=3), pser.nlargest(n=3))
self.assert_eq(psser.nlargest(), pser.nlargest())
self.assert_eq((psser + 1).nlargest(), (pser + 1).nlargest())
def test_is_unique(self):
# We can't use pandas' is_unique for comparison. pandas 0.23 ignores None
pser = pd.Series([1, 2, 2, None, None])
psser = ps.from_pandas(pser)
self.assertEqual(False, psser.is_unique)
self.assertEqual(False, (psser + 1).is_unique)
pser = pd.Series([1, None, None])
psser = ps.from_pandas(pser)
self.assertEqual(False, psser.is_unique)
self.assertEqual(False, (psser + 1).is_unique)
pser = pd.Series([1])
psser = ps.from_pandas(pser)
self.assertEqual(pser.is_unique, psser.is_unique)
self.assertEqual((pser + 1).is_unique, (psser + 1).is_unique)
pser = pd.Series([1, 1, 1])
psser = ps.from_pandas(pser)
self.assertEqual(pser.is_unique, psser.is_unique)
self.assertEqual((pser + 1).is_unique, (psser + 1).is_unique)
def test_median(self):
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a")
def test_rank(self):
pser = pd.Series([1, 2, 3, 1], name="x")
psser = ps.from_pandas(pser)
self.assert_eq(pser.rank(), psser.rank().sort_index())
self.assert_eq(pser.rank().sum(), psser.rank().sum())
self.assert_eq(pser.rank(ascending=False), psser.rank(ascending=False).sort_index())
self.assert_eq(pser.rank(method="min"), psser.rank(method="min").sort_index())
self.assert_eq(pser.rank(method="max"), psser.rank(method="max").sort_index())
self.assert_eq(pser.rank(method="first"), psser.rank(method="first").sort_index())
self.assert_eq(pser.rank(method="dense"), psser.rank(method="dense").sort_index())
non_numeric_pser = pd.Series(["a", "c", "b", "d"], name="x", index=[10, 11, 12, 13])
non_numeric_psser = ps.from_pandas(non_numeric_pser)
self.assert_eq(
non_numeric_pser.rank(numeric_only=None),
non_numeric_psser.rank(numeric_only=None).sort_index(),
)
self.assert_eq(
non_numeric_pser.rank(numeric_only=False),
non_numeric_psser.rank(numeric_only=False).sort_index(),
)
msg = "Series.rank does not allow numeric_only=True with non-numeric dtype."
with self.assertRaisesRegex(TypeError, msg):
non_numeric_psser.rank(numeric_only=True)
msg = "Series.rank does not allow numeric_only=True with non-numeric dtype."
with self.assertRaisesRegex(TypeError, msg):
(non_numeric_psser + "x").rank(numeric_only=True)
msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
with self.assertRaisesRegex(ValueError, msg):
psser.rank(method="nothing")
msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
with self.assertRaisesRegex(ValueError, msg):
psser.rank(method="nothing")
midx = pd.MultiIndex.from_tuples([("a", "b"), ("a", "c"), ("b", "c"), ("c", "d")])
pser.index = midx
psser = ps.from_pandas(pser)
msg = "rank do not support MultiIndex now"
with self.assertRaisesRegex(NotImplementedError, msg):
psser.rank(method="min")
def test_round(self):
pser = pd.Series([0.028208, 0.038683, 0.877076], name="x")
psser = ps.from_pandas(pser)
self.assert_eq(pser.round(2), psser.round(2))
msg = "decimals must be an integer"
with self.assertRaisesRegex(TypeError, msg):
psser.round(1.5)
def test_quantile(self):
pser = pd.Series([])
psser = ps.from_pandas(pser)
self.assert_eq(psser.quantile(0.5), pser.quantile(0.5))
self.assert_eq(psser.quantile([0.25, 0.5, 0.75]), pser.quantile([0.25, 0.5, 0.75]))
with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(accuracy="a")
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=1)
with self.assertRaisesRegex(TypeError, "q must be a float or an array of floats;"):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])
with self.assertRaisesRegex(
ValueError, "percentiles should all be in the interval \\[0, 1\\]"
):
ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=1.1)
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ps.Series(["a", "b", "c"]).quantile()
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ps.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75])
def test_pct_change(self):
pser = pd.Series([90, 91, 85], index=[2, 4, 1])
psser = ps.from_pandas(pser)
self.assert_eq(psser.pct_change(), pser.pct_change(), check_exact=False)
self.assert_eq(psser.pct_change().sum(), pser.pct_change().sum(), almost=True)
self.assert_eq(psser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False)
self.assert_eq(psser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False)
self.assert_eq(psser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000))
self.assert_eq(psser.pct_change(periods=100000000), pser.pct_change(periods=100000000))
# for MultiIndex
midx = pd.MultiIndex(
[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
)
pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
psser = ps.from_pandas(pser)
self.assert_eq(psser.pct_change(), pser.pct_change(), check_exact=False)
self.assert_eq(psser.pct_change().sum(), pser.pct_change().sum(), almost=True)
self.assert_eq(psser.pct_change(periods=2), pser.pct_change(periods=2), check_exact=False)
self.assert_eq(psser.pct_change(periods=-1), pser.pct_change(periods=-1), check_exact=False)
self.assert_eq(psser.pct_change(periods=-100000000), pser.pct_change(periods=-100000000))
self.assert_eq(psser.pct_change(periods=100000000), pser.pct_change(periods=100000000))
def test_divmod(self):
pser = pd.Series([100, None, 300, None, 500], name="Koalas")
psser = ps.from_pandas(pser)
kdiv, kmod = psser.divmod(-100)
pdiv, pmod = pser.divmod(-100)
self.assert_eq(kdiv, pdiv)
self.assert_eq(kmod, pmod)
kdiv, kmod = psser.divmod(100)
pdiv, pmod = pser.divmod(100)
self.assert_eq(kdiv, pdiv)
self.assert_eq(kmod, pmod)
def test_rdivmod(self):
pser = pd.Series([100, None, 300, None, 500])
psser = ps.from_pandas(pser)
krdiv, krmod = psser.rdivmod(-100)
prdiv, prmod = pser.rdivmod(-100)
self.assert_eq(krdiv, prdiv)
self.assert_eq(krmod, prmod)
krdiv, krmod = psser.rdivmod(100)
prdiv, prmod = pser.rdivmod(100)
self.assert_eq(krdiv, prdiv)
self.assert_eq(krmod, prmod)
def test_mod(self):
pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas")
psser = ps.from_pandas(pser)
self.assert_eq(psser.mod(-150), pser.mod(-150))
self.assert_eq(psser.mod(0), pser.mod(0))
self.assert_eq(psser.mod(150), pser.mod(150))
pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6})
psdf = ps.from_pandas(pdf)
self.assert_eq(psdf.a.mod(psdf.b), pdf.a.mod(pdf.b))
def test_mode(self):
pser = pd.Series([0, 0, 1, 1, 1, np.nan, np.nan, np.nan])
psser = ps.from_pandas(pser)
self.assert_eq(psser.mode(), pser.mode())
self.assert_eq(
psser.mode(dropna=False).sort_values().reset_index(drop=True),
pser.mode(dropna=False).sort_values().reset_index(drop=True),
)
pser.name = "x"
psser = ps.from_pandas(pser)
self.assert_eq(psser.mode(), pser.mode())
self.assert_eq(
psser.mode(dropna=False).sort_values().reset_index(drop=True),
pser.mode(dropna=False).sort_values().reset_index(drop=True),
)
def test_rmod(self):
pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas")
psser = ps.from_pandas(pser)
self.assert_eq(psser.rmod(-150), pser.rmod(-150))
self.assert_eq(psser.rmod(0), pser.rmod(0))
self.assert_eq(psser.rmod(150), pser.rmod(150))
pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6})
psdf = ps.from_pandas(pdf)
self.assert_eq(psdf.a.rmod(psdf.b), pdf.a.rmod(pdf.b))
def test_div_zero_and_nan(self):
pser = pd.Series([100, None, -300, None, 500, -700, np.inf, -np.inf], name="Koalas")
psser = ps.from_pandas(pser)
self.assert_eq(pser.div(0), psser.div(0))
self.assert_eq(pser.truediv(0), psser.truediv(0))
self.assert_eq(pser / 0, psser / 0)
self.assert_eq(pser.div(np.nan), psser.div(np.nan))
self.assert_eq(pser.truediv(np.nan), psser.truediv(np.nan))
self.assert_eq(pser / np.nan, psser / np.nan)
self.assert_eq(pser.floordiv(0), psser.floordiv(0))
self.assert_eq(pser // 0, psser // 0)
self.assert_eq(pser.floordiv(np.nan), psser.floordiv(np.nan))
def test_product(self):
pser = pd.Series([10, 20, 30, 40, 50])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(), psser.prod())
# Containing NA values
pser = pd.Series([10, np.nan, 30, np.nan, 50])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(), psser.prod(), almost=True)
# All-NA values
pser = pd.Series([np.nan, np.nan, np.nan])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(), psser.prod())
# Boolean Series
pser = pd.Series([True, True, True])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(numeric_only=True), psser.prod())
pser = pd.Series([False, False, False])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(numeric_only=True), psser.prod())
pser = pd.Series([True, False, True])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(numeric_only=True), psser.prod())
# With `min_count` parameter
pser = pd.Series([10, 20, 30, 40, 50])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(min_count=5), psser.prod(min_count=5))
self.assert_eq(pser.prod(min_count=6), psser.prod(min_count=6))
pser = pd.Series([10, np.nan, 30, np.nan, 50])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(min_count=3), psser.prod(min_count=3), almost=True)
self.assert_eq(pser.prod(min_count=4), psser.prod(min_count=4))
pser = pd.Series([np.nan, np.nan, np.nan])
psser = ps.from_pandas(pser)
self.assert_eq(pser.prod(min_count=1), psser.prod(min_count=1))
with self.assertRaisesRegex(TypeError, "Could not convert object \\(void\\) to numeric"):
ps.Series([]).prod(numeric_only=True)
with self.assertRaisesRegex(TypeError, "Could not convert object \\(void\\) to numeric"):
ps.Series([]).prod(min_count=1)
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ps.Series(["a", "b", "c"]).prod()
with self.assertRaisesRegex(
TypeError, "Could not convert datetime64\\[ns\\] \\(timestamp.*\\) to numeric"
):
ps.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()
with self.assertRaisesRegex(NotImplementedError, "Series does not support columns axis."):
psser.prod(axis=1)
def test_hasnans(self):
# BooleanType
pser = pd.Series([True, False, True, True])
psser = ps.from_pandas(pser)
self.assert_eq(pser.hasnans, psser.hasnans)
pser = pd.Series([True, False, np.nan, True])
psser = ps.from_pandas(pser)
self.assert_eq(pser.hasnans, psser.hasnans)
# TimestampType
pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)])
psser = ps.from_pandas(pser)
self.assert_eq(pser.hasnans, psser.hasnans)
pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")])
psser = ps.from_pandas(pser)
self.assert_eq(pser.hasnans, psser.hasnans)
# DecimalType
pser = pd.Series([Decimal("0.1"), Decimal("NaN")])
psser = ps.from_pandas(pser)
self.assert_eq(pser.hasnans, psser.hasnans)
# empty
pser = pd.Series([])
psser = ps.from_pandas(pser)
self.assert_eq(pser.hasnans, psser.hasnans)
def test_pow_and_rpow(self):
pser = pd.Series([1, 2, np.nan])
psser = ps.from_pandas(pser)
self.assert_eq(pser.pow(np.nan), psser.pow(np.nan))
self.assert_eq(pser**np.nan, psser**np.nan)
self.assert_eq(pser.rpow(np.nan), psser.rpow(np.nan))
self.assert_eq(1**pser, 1**psser)
def test_autocorr(self):
pdf = pd.DataFrame({"s1": [0.90010907, 0.13484424, 0.62036035]})
self._test_autocorr(pdf)
pdf = pd.DataFrame({"s1": [0.90010907, np.nan, 0.13484424, 0.62036035]})
self._test_autocorr(pdf)
pdf = pd.DataFrame({"s1": [0.2, 0.0, 0.6, 0.2, np.nan, 0.5, 0.6]})
self._test_autocorr(pdf)
psser = ps.from_pandas(pdf["s1"])
with self.assertRaisesRegex(TypeError, r"lag should be an int; however, got"):
psser.autocorr(1.0)
psser = ps.Series([1, 0, 0, 0])
self.assertTrue(bool(np.isnan(psser.autocorr())))
def _test_autocorr(self, pdf):
psdf = ps.from_pandas(pdf)
for lag in range(-10, 10):
p_autocorr = pdf["s1"].autocorr(lag)
ps_autocorr = psdf["s1"].autocorr(lag)
self.assert_eq(p_autocorr, ps_autocorr, almost=True)
def test_cov(self):
pdf = pd.DataFrame(
{
"s1": ["a", "b", "c"],
"s2": [0.12528585, 0.26962463, 0.51111198],
},
index=[0, 1, 2],
)
psdf = ps.from_pandas(pdf)
with self.assertRaisesRegex(TypeError, "unsupported dtype: object"):
psdf["s1"].cov(psdf["s2"])
with self.assertRaisesRegex(TypeError, "unsupported dtype: object"):
psdf["s2"].cov(psdf["s1"])
with self.assertRaisesRegex(TypeError, "ddof must be integer"):
psdf["s2"].cov(psdf["s2"], ddof="ddof")
pdf = pd.DataFrame(
{
"s1": [0.90010907, 0.13484424, 0.62036035],
"s2": [0.12528585, 0.26962463, 0.51111198],
},
index=[0, 1, 2],
)
self._test_cov(pdf)
pdf = pd.DataFrame(
{
"s1": [0.90010907, np.nan, 0.13484424, 0.62036035],
"s2": [0.12528585, 0.81131178, 0.26962463, 0.51111198],
},
index=[0, 1, 2, 3],
)
self._test_cov(pdf)
def _test_cov(self, pdf):
psdf = ps.from_pandas(pdf)
self.assert_eq(pdf["s1"].cov(pdf["s2"]), psdf["s1"].cov(psdf["s2"]), almost=True)
self.assert_eq(
pdf["s1"].cov(pdf["s2"], ddof=2), psdf["s1"].cov(psdf["s2"], ddof=2), almost=True
)
self.assert_eq(
pdf["s1"].cov(pdf["s2"], min_periods=3),
psdf["s1"].cov(psdf["s2"], min_periods=3),
almost=True,
)
self.assert_eq(
pdf["s1"].cov(pdf["s2"], min_periods=3, ddof=-1),
psdf["s1"].cov(psdf["s2"], min_periods=3, ddof=-1),
almost=True,
)
self.assert_eq(
pdf["s1"].cov(pdf["s2"], min_periods=4),
psdf["s1"].cov(psdf["s2"], min_periods=4),
almost=True,
)
self.assert_eq(
pdf["s1"].cov(pdf["s2"], min_periods=4, ddof=3),
psdf["s1"].cov(psdf["s2"], min_periods=4, ddof=3),
almost=True,
)
def test_series_stat_fail(self):
with self.assertRaisesRegex(TypeError, "Could not convert object"):
ps.Series(["a", "b", "c"]).mean()
with self.assertRaisesRegex(TypeError, "Could not convert object"):
ps.Series(["a", "b", "c"]).skew()
with self.assertRaisesRegex(TypeError, "Could not convert object"):
ps.Series(["a", "b", "c"]).kurtosis()
with self.assertRaisesRegex(TypeError, "Could not convert object"):
ps.Series(["a", "b", "c"]).std()
with self.assertRaisesRegex(TypeError, "Could not convert object"):
ps.Series(["a", "b", "c"]).var()
with self.assertRaisesRegex(TypeError, "Could not convert object"):
ps.Series(["a", "b", "c"]).median()
with self.assertRaisesRegex(TypeError, "Could not convert object"):
ps.Series(["a", "b", "c"]).sem()
class SeriesStatTests(
SeriesStatMixin,
PandasOnSparkTestCase,
SQLTestUtils,
):
pass
if __name__ == "__main__":
from pyspark.pandas.tests.series.test_stat import * # noqa: F401
try:
import xmlrunner
testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
except ImportError:
testRunner = None
unittest.main(testRunner=testRunner, verbosity=2)