python/pyspark/pandas/plot/core.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import importlib
 import math

 import pandas as pd
 import numpy as np
 from pandas.core.base import PandasObject
 from pandas.core.dtypes.inference import is_integer

 from pyspark.sql import functions as F, Column
 from pyspark.sql.internal import InternalFunction as SF
 from pyspark.pandas.missing import unsupported_function
 from pyspark.pandas.config import get_option
 from pyspark.pandas.utils import name_like_string


 class TopNPlotBase:
     def get_top_n(self, data):
         from pyspark.pandas import DataFrame, Series

         max_rows = get_option("plotting.max_rows")
         # Simply use the first 1k elements and make it into a pandas dataframe
         # For categorical variables, it is likely called from df.x.value_counts().plot.xxx().
         if isinstance(data, (Series, DataFrame)):
             data = data.head(max_rows + 1)._to_pandas()
         else:
             raise TypeError("Only DataFrame and Series are supported for plotting.")

         self.partial = False
         if len(data) > max_rows:
             self.partial = True
             data = data.iloc[:max_rows]
         return data

     def set_result_text(self, ax):
         max_rows = get_option("plotting.max_rows")
         assert hasattr(self, "partial")

         if self.partial:
             ax.text(
                 1,
                 1,
                 "showing top {} elements only".format(max_rows),
                 size=6,
                 ha="right",
                 va="bottom",
                 transform=ax.transAxes,
             )


 class SampledPlotBase:
     def get_sampled(self, data):
         from pyspark.pandas import DataFrame, Series

         if not isinstance(data, (DataFrame, Series)):
             raise TypeError("Only DataFrame and Series are supported for plotting.")
         if isinstance(data, Series):
             data = data.to_frame()

         fraction = get_option("plotting.sample_ratio")
         if fraction is not None:
             self.fraction = fraction
             sampled = data._internal.resolved_copy.spark_frame.sample(fraction=self.fraction)
             return DataFrame(data._internal.with_new_sdf(sampled))._to_pandas()
         else:
             from pyspark.sql import Observation

             max_rows = get_option("plotting.max_rows")
             observation = Observation("ps plotting")
             sdf = data._internal.resolved_copy.spark_frame.observe(
                 observation, F.count(F.lit(1)).alias("count")
             )

             rand_col_name = "__ps_plotting_sampled_plot_base_rand__"
             id_col_name = "__ps_plotting_sampled_plot_base_id__"

             sampled = (
                 sdf.select(
                     "*",
                     F.rand().alias(rand_col_name),
                     F.monotonically_increasing_id().alias(id_col_name),
                 )
                 .sort(rand_col_name)
                 .limit(max_rows + 1)
                 .coalesce(1)
                 .sortWithinPartitions(id_col_name)
                 .drop(rand_col_name, id_col_name)
             )

             pdf = DataFrame(data._internal.with_new_sdf(sampled))._to_pandas()

             if len(pdf) > max_rows:
                 try:
                     self.fraction = float(max_rows) / observation.get["count"]
                 except Exception:
                     pass
                 return pdf[:max_rows]
             else:
                 self.fraction = 1.0
                 return pdf

     def set_result_text(self, ax):
         assert hasattr(self, "fraction")

         if self.fraction < 1:
             ax.text(
                 1,
                 1,
                 "showing the sampled result by fraction %s" % self.fraction,
                 size=6,
                 ha="right",
                 va="bottom",
                 transform=ax.transAxes,
             )


 class NumericPlotBase:
     @staticmethod
     def prepare_numeric_data(data):
         from pyspark.pandas.series import Series

         if isinstance(data, Series):
             data = data.to_frame()

         numeric_data = data.select_dtypes(
             include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
         )

         # no empty frames or series allowed
         if len(numeric_data.columns) == 0:
             raise TypeError(
                 "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
             )

         return data, numeric_data


 class HistogramPlotBase(NumericPlotBase):
     @staticmethod
     def prepare_hist_data(data, bins):
         data, numeric_data = NumericPlotBase.prepare_numeric_data(data)
         if is_integer(bins):
             # computes boundaries for the column
             bins = HistogramPlotBase.get_bins(data._to_spark(), bins)

         return numeric_data, bins

     @staticmethod
     def get_bins(sdf, bins):
         # 'data' is a Spark DataFrame that selects all columns.
         if len(sdf.columns) > 1:
             min_col = F.least(*map(F.min, sdf))
             max_col = F.greatest(*map(F.max, sdf))
         else:
             min_col = F.min(sdf.columns[-1])
             max_col = F.max(sdf.columns[-1])
         boundaries = sdf.select(min_col, max_col).first()

         # divides the boundaries into bins
         if boundaries[0] == boundaries[1]:
             boundaries = (boundaries[0] - 0.5, boundaries[1] + 0.5)

         return np.linspace(boundaries[0], boundaries[1], bins + 1)

     @staticmethod
     def compute_hist(psdf, bins):
         # 'data' is a Spark DataFrame that selects one column.
         assert isinstance(bins, (np.ndarray, np.generic))
         assert len(bins) > 2, "the number of buckets must be higher than 2."

         sdf = psdf._internal.spark_frame
         scols = []
         input_column_names = []
         for label in psdf._internal.column_labels:
             input_column_name = name_like_string(label)
             input_column_names.append(input_column_name)
             scols.append(psdf._internal.spark_column_for(label).alias(input_column_name))
         sdf = sdf.select(*scols)

         # 1. Make the bucket output flat to:
         #     +----------+-------+
         #     |__group_id|buckets|
         #     +----------+-------+
         #     |0         |0.0    |
         #     |0         |0.0    |
         #     |0         |1.0    |
         #     |0         |2.0    |
         #     |0         |3.0    |
         #     |0         |3.0    |
         #     |1         |0.0    |
         #     |1         |1.0    |
         #     |1         |1.0    |
         #     |1         |2.0    |
         #     |1         |1.0    |
         #     |1         |0.0    |
         #     +----------+-------+
         colnames = sdf.columns
         bucket_names = ["__{}_bucket".format(colname) for colname in colnames]

         # refers to org.apache.spark.ml.feature.Bucketizer#binarySearchForBuckets
         def binary_search_for_buckets(value: Column):
             index = SF.array_binary_search(F.lit(bins), value)
             bucket = F.when(index >= 0, index).otherwise(-index - 2)
             unboundErrMsg = F.lit(f"value %s out of the bins bounds: [{bins[0]}, {bins[-1]}]")
             return (
                 F.when(value == F.lit(bins[-1]), F.lit(len(bins) - 2))
                 .when(value.between(F.lit(bins[0]), F.lit(bins[-1])), bucket)
                 .otherwise(F.raise_error(F.printf(unboundErrMsg, value)))
             )

         output_df = (
             sdf.select(
                 F.posexplode(
                     F.array([F.col(colname).cast("double") for colname in colnames])
                 ).alias("__group_id", "__value")
             )
             .where(F.col("__value").isNotNull() & ~F.col("__value").isNaN())
             .select(
                 F.col("__group_id"),
                 binary_search_for_buckets(F.col("__value")).cast("double").alias("__bucket"),
             )
         )

         # 2. Calculate the count based on each group and bucket.
         #     +----------+-------+------+
         #     |__group_id|buckets| count|
         #     +----------+-------+------+
         #     |0         |0.0    |2     |
         #     |0         |1.0    |1     |
         #     |0         |2.0    |1     |
         #     |0         |3.0    |2     |
         #     |1         |0.0    |2     |
         #     |1         |1.0    |3     |
         #     |1         |2.0    |1     |
         #     +----------+-------+------+
         result = (
             output_df.groupby("__group_id", "__bucket")
             .agg(F.count("*").alias("count"))
             .toPandas()
             .sort_values(by=["__group_id", "__bucket"])
         )

         # 3. Fill empty bins and calculate based on each group id. From:
         #     +----------+--------+------+
         #     |__group_id|__bucket| count|
         #     +----------+--------+------+
         #     |0         |0.0     |2     |
         #     |0         |1.0     |1     |
         #     |0         |2.0     |1     |
         #     |0         |3.0     |2     |
         #     +----------+--------+------+
         #     +----------+--------+------+
         #     |__group_id|__bucket| count|
         #     +----------+--------+------+
         #     |1         |0.0     |2     |
         #     |1         |1.0     |3     |
         #     |1         |2.0     |1     |
         #     +----------+--------+------+
         #
         # to:
         #     +-----------------+
         #     |__values1__bucket|
         #     +-----------------+
         #     |2                |
         #     |1                |
         #     |1                |
         #     |2                |
         #     |0                |
         #     +-----------------+
         #     +-----------------+
         #     |__values2__bucket|
         #     +-----------------+
         #     |2                |
         #     |3                |
         #     |1                |
         #     |0                |
         #     |0                |
         #     +-----------------+
         output_series = []
         for i, (input_column_name, bucket_name) in enumerate(zip(input_column_names, bucket_names)):
             current_bucket_result = result[result["__group_id"] == i]
             # generates a pandas DF with one row for each bin
             # we need this as some of the bins may be empty
             indexes = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
             # merges the bins with counts on it and fills remaining ones with zeros
             pdf = indexes.merge(current_bucket_result, how="left", on=["__bucket"]).fillna(0)[
                 ["count"]
             ]
             pdf.columns = [input_column_name]
             output_series.append(pdf[input_column_name])

         return output_series


 class BoxPlotBase:
     @staticmethod
     def compute_box(sdf, colnames, whis, precision, showfliers):
         assert len(colnames) > 0
         formatted_colnames = ["`{}`".format(colname) for colname in colnames]

         stats_scols = []
         for i, colname in enumerate(formatted_colnames):
             percentiles = F.percentile_approx(colname, [0.25, 0.50, 0.75], int(1.0 / precision))
             q1 = F.get(percentiles, 0)
             med = F.get(percentiles, 1)
             q3 = F.get(percentiles, 2)
             iqr = q3 - q1
             lfence = q1 - F.lit(whis) * iqr
             ufence = q3 + F.lit(whis) * iqr

             stats_scols.append(
                 F.struct(
                     F.mean(colname).alias("mean"),
                     med.alias("med"),
                     q1.alias("q1"),
                     q3.alias("q3"),
                     lfence.alias("lfence"),
                     ufence.alias("ufence"),
                 ).alias(f"_box_plot_stats_{i}")
             )

         sdf_stats = sdf.select(*stats_scols)

         result_scols = []
         for i, colname in enumerate(formatted_colnames):
             value = F.col(colname)

             lfence = F.col(f"_box_plot_stats_{i}.lfence")
             ufence = F.col(f"_box_plot_stats_{i}.ufence")
             mean = F.col(f"_box_plot_stats_{i}.mean")
             med = F.col(f"_box_plot_stats_{i}.med")
             q1 = F.col(f"_box_plot_stats_{i}.q1")
             q3 = F.col(f"_box_plot_stats_{i}.q3")

             outlier = ~value.between(lfence, ufence)

             # Computes min and max values of non-outliers - the whiskers
             upper_whisker = F.max(F.when(~outlier, value).otherwise(F.lit(None)))
             lower_whisker = F.min(F.when(~outlier, value).otherwise(F.lit(None)))

             # If it shows fliers, take the top 1k with the highest absolute values
             # Here we normalize the values by subtracting the median.
             if showfliers:
                 pair = F.when(
                     outlier,
                     F.struct(F.abs(value - med), value.alias("val")),
                 ).otherwise(F.lit(None))
                 topk = SF.collect_top_k(pair, 1001, False)
                 fliers = F.when(F.size(topk) > 0, topk["val"]).otherwise(F.lit(None))
             else:
                 fliers = F.lit(None)

             result_scols.append(
                 F.struct(
                     F.first(mean).alias("mean"),
                     F.first(med).alias("med"),
                     F.first(q1).alias("q1"),
                     F.first(q3).alias("q3"),
                     upper_whisker.alias("upper_whisker"),
                     lower_whisker.alias("lower_whisker"),
                     fliers.alias("fliers"),
                 ).alias(f"_box_plot_results_{i}")
             )

         sdf_result = sdf.join(sdf_stats.hint("broadcast")).select(*result_scols)
         return sdf_result.first()


 class KdePlotBase(NumericPlotBase):
     @staticmethod
     def prepare_kde_data(data):
         _, numeric_data = NumericPlotBase.prepare_numeric_data(data)
         return numeric_data

     @staticmethod
     def get_ind(sdf, ind):
         def calc_min_max():
             if len(sdf.columns) > 1:
                 min_col = F.least(*map(F.min, sdf))
                 max_col = F.greatest(*map(F.max, sdf))
             else:
                 min_col = F.min(sdf.columns[-1])
                 max_col = F.max(sdf.columns[-1])
             return sdf.select(min_col, max_col).first()

         if ind is None:
             min_val, max_val = calc_min_max()
             sample_range = max_val - min_val
             ind = np.linspace(
                 min_val - 0.5 * sample_range,
                 max_val + 0.5 * sample_range,
                 1000,
             )
         elif is_integer(ind):
             min_val, max_val = calc_min_max()
             sample_range = max_val - min_val
             ind = np.linspace(
                 min_val - 0.5 * sample_range,
                 max_val + 0.5 * sample_range,
                 ind,
             )
         return ind

     @staticmethod
     def compute_kde_col(input_col, bw_method=None, ind=None):
         # refers to org.apache.spark.mllib.stat.KernelDensity
         assert bw_method is not None and isinstance(
             bw_method, (int, float)
         ), "'bw_method' must be set as a scalar number."

         assert ind is not None, "'ind' must be a scalar array."

         bandwidth = float(bw_method)
         points = [float(i) for i in ind]
         log_std_plus_half_log2_pi = math.log(bandwidth) + 0.5 * math.log(2 * math.pi)

         def norm_pdf(
             mean: Column,
             std: Column,
             log_std_plus_half_log2_pi: Column,
             x: Column,
         ) -> Column:
             x0 = x - mean
             x1 = x0 / std
             log_density = -0.5 * x1 * x1 - log_std_plus_half_log2_pi
             return F.exp(log_density)

         return F.array(
             [
                 F.avg(
                     norm_pdf(
                         input_col.cast("double"),
                         F.lit(bandwidth),
                         F.lit(log_std_plus_half_log2_pi),
                         F.lit(point),
                     )
                 )
                 for point in points
             ]
         )

     @staticmethod
     def compute_kde(sdf, bw_method=None, ind=None):
         input_col = F.col(sdf.columns[0])
         kde_col = KdePlotBase.compute_kde_col(input_col, bw_method, ind).alias("kde")
         row = sdf.select(kde_col).first()
         return row[0]


 class PandasOnSparkPlotAccessor(PandasObject):
     """
     Series/Frames plotting accessor and method.

     Uses the backend specified by the
     option ``plotting.backend``. By default, plotly is used.

     Plotting methods can also be accessed by calling the accessor as a method
     with the ``kind`` argument:
     ``s.plot(kind='hist')`` is equivalent to ``s.plot.hist()``
     """

     pandas_plot_data_map = {
         "pie": TopNPlotBase().get_top_n,
         "bar": TopNPlotBase().get_top_n,
         "barh": TopNPlotBase().get_top_n,
         "scatter": SampledPlotBase().get_sampled,
         "area": SampledPlotBase().get_sampled,
         "line": SampledPlotBase().get_sampled,
     }
     _backends = {}  # type: ignore[var-annotated]

     def __init__(self, data):
         self.data = data

     @staticmethod
     def _find_backend(backend):
         """
         Find a pandas-on-Spark plotting backend
         """
         try:
             return PandasOnSparkPlotAccessor._backends[backend]
         except KeyError:
             try:
                 module = importlib.import_module(backend)
             except ImportError:
                 # We re-raise later on.
                 pass
             else:
                 if hasattr(module, "plot") or hasattr(module, "plot_pandas_on_spark"):
                     # Validate that the interface is implemented when the option
                     # is set, rather than at plot time.
                     PandasOnSparkPlotAccessor._backends[backend] = module
                     return module

         raise ValueError(
             "Could not find plotting backend '{backend}'. Ensure that you've installed "
             "the package providing the '{backend}' entrypoint, or that the package has a "
             "top-level `.plot` method.".format(backend=backend)
         )

     @staticmethod
     def _get_plot_backend(backend=None):
         backend = backend or get_option("plotting.backend")
         # Shortcut
         if backend in PandasOnSparkPlotAccessor._backends:
             return PandasOnSparkPlotAccessor._backends[backend]

         if backend == "matplotlib":
             # Because matplotlib is an optional dependency,
             # we need to attempt an import here to raise an ImportError if needed.
             try:
                 # test if matplotlib can be imported
                 import matplotlib  # noqa: F401
                 from pyspark.pandas.plot import matplotlib as module
             except ImportError:
                 raise ImportError(
                     "matplotlib is required for plotting when the "
                     "default backend 'matplotlib' is selected."
                 ) from None

             PandasOnSparkPlotAccessor._backends["matplotlib"] = module
         elif backend == "plotly":
             try:
                 # test if plotly can be imported
                 import plotly  # noqa: F401
                 from pyspark.pandas.plot import plotly as module
             except ImportError:
                 raise ImportError(
                     "plotly is required for plotting when the "
                     "default backend 'plotly' is selected."
                 ) from None

             PandasOnSparkPlotAccessor._backends["plotly"] = module
         else:
             module = PandasOnSparkPlotAccessor._find_backend(backend)
             PandasOnSparkPlotAccessor._backends[backend] = module
         return module

     def __call__(self, kind="line", backend=None, **kwargs):
         plot_backend = PandasOnSparkPlotAccessor._get_plot_backend(backend)
         plot_data = self.data

         if hasattr(plot_backend, "plot_pandas_on_spark"):
             # use if there's pandas-on-Spark specific method.
             return plot_backend.plot_pandas_on_spark(plot_data, kind=kind, **kwargs)
         else:
             # fallback to use pandas'
             if not PandasOnSparkPlotAccessor.pandas_plot_data_map[kind]:
                 raise NotImplementedError(
                     "'%s' plot is not supported with '%s' plot "
                     "backend yet." % (kind, plot_backend.__name__)
                 )
             plot_data = PandasOnSparkPlotAccessor.pandas_plot_data_map[kind](plot_data)
             return plot_backend.plot(plot_data, kind=kind, **kwargs)

     def line(self, x=None, y=None, **kwargs):
         """
         Plot DataFrame/Series as lines.

         This function is useful to plot lines using DataFrame’s values
         as coordinates.

         Parameters
         ----------
         x : int or str, optional
             Columns to use for the horizontal axis.
             Either the location or the label of the columns to be used.
             By default, it will use the DataFrame indices.
         y : int, str, or list of them, optional
             The values to be plotted.
             Either the location or the label of the columns to be used.
             By default, it will use the remaining DataFrame numeric columns.
         **kwds
             Keyword arguments to pass on to :meth:`Series.plot` or :meth:`DataFrame.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         See Also
         --------
         plotly.express.line : Plot y versus x as lines and/or markers (plotly).
         matplotlib.pyplot.plot : Plot y versus x as lines and/or markers (matplotlib).

         Examples
         --------
         Basic plot.

         For Series:

         .. plotly::

             >>> s = ps.Series([1, 3, 2])
             >>> s.plot.line()  # doctest: +SKIP

         For DataFrame:

         .. plotly::

             The following example shows the populations for some animals
             over the years.

             >>> df = ps.DataFrame({'pig': [20, 18, 489, 675, 1776],
             ...                    'horse': [4, 25, 281, 600, 1900]},
             ...                   index=[1990, 1997, 2003, 2009, 2014])
             >>> df.plot.line()  # doctest: +SKIP

         .. plotly::

             The following example shows the relationship between both
             populations.

             >>> df = ps.DataFrame({'pig': [20, 18, 489, 675, 1776],
             ...                    'horse': [4, 25, 281, 600, 1900]},
             ...                   index=[1990, 1997, 2003, 2009, 2014])
             >>> df.plot.line(x='pig', y='horse')  # doctest: +SKIP
         """
         return self(kind="line", x=x, y=y, **kwargs)

     def bar(self, x=None, y=None, **kwds):
         """
         Vertical bar plot.

         A bar plot is a plot that presents categorical data with rectangular
         bars with lengths proportional to the values that they represent. A
         bar plot shows comparisons among discrete categories. One axis of the
         plot shows the specific categories being compared, and the other axis
         represents a measured value.

         Parameters
         ----------
         x : label or position, optional
             Allows plotting of one column versus another.
             If not specified, the index of the DataFrame is used.
         y : label or position, optional
             Allows plotting of one column versus another.
             If not specified, all numerical columns are used.
         **kwds : optional
             Additional keyword arguments are documented in
             :meth:`pyspark.pandas.Series.plot` or
             :meth:`pyspark.pandas.DataFrame.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         Examples
         --------
         Basic plot.

         For Series:

         .. plotly::

             >>> s = ps.Series([1, 3, 2])
             >>> s.plot.bar()  # doctest: +SKIP

         For DataFrame:

         .. plotly::

             >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
             >>> df.plot.bar(x='lab', y='val')  # doctest: +SKIP

         Plot a whole dataframe to a bar plot. Each column is stacked with a
         distinct color along the horizontal axis.

         .. plotly::

             >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
             >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
             >>> index = ['snail', 'pig', 'elephant',
             ...          'rabbit', 'giraffe', 'coyote', 'horse']
             >>> df = ps.DataFrame({'speed': speed,
             ...                    'lifespan': lifespan}, index=index)
             >>> df.plot.bar()  # doctest: +SKIP

         Instead of stacking, the figure can be split by column with plotly
         APIs.

         .. plotly::

             >>> from plotly.subplots import make_subplots
             >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
             >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
             >>> index = ['snail', 'pig', 'elephant',
             ...          'rabbit', 'giraffe', 'coyote', 'horse']
             >>> df = ps.DataFrame({'speed': speed,
             ...                    'lifespan': lifespan}, index=index)
             >>> fig = (make_subplots(rows=2, cols=1)
             ...        .add_trace(df.plot.bar(y='speed').data[0], row=1, col=1)
             ...        .add_trace(df.plot.bar(y='speed').data[0], row=1, col=1)
             ...        .add_trace(df.plot.bar(y='lifespan').data[0], row=2, col=1))
             >>> fig  # doctest: +SKIP

         Plot a single column.

         .. plotly::

             >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
             >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
             >>> index = ['snail', 'pig', 'elephant',
             ...          'rabbit', 'giraffe', 'coyote', 'horse']
             >>> df = ps.DataFrame({'speed': speed,
             ...                    'lifespan': lifespan}, index=index)
             >>> df.plot.bar(y='speed')  # doctest: +SKIP

         Plot only selected categories for the DataFrame.

         .. plotly::

             >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
             >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
             >>> index = ['snail', 'pig', 'elephant',
             ...          'rabbit', 'giraffe', 'coyote', 'horse']
             >>> df = ps.DataFrame({'speed': speed,
             ...                    'lifespan': lifespan}, index=index)
             >>> df.plot.bar(x='lifespan')  # doctest: +SKIP
         """
         from pyspark.pandas import DataFrame, Series

         if isinstance(self.data, Series):
             return self(kind="bar", **kwds)
         elif isinstance(self.data, DataFrame):
             return self(kind="bar", x=x, y=y, **kwds)

     def barh(self, x=None, y=None, **kwargs):
         """
         Make a horizontal bar plot.

         A horizontal bar plot is a plot that presents quantitative data with
         rectangular bars with lengths proportional to the values that they
         represent. A bar plot shows comparisons among discrete categories. One
         axis of the plot shows the specific categories being compared, and the
         other axis represents a measured value.

         Parameters
         ----------
         x : label or position, default All numeric columns in dataframe
             Columns to be plotted from the DataFrame.
         y : label or position, default DataFrame.index
             Column to be used for categories.
         **kwds
             Keyword arguments to pass on to
             :meth:`pyspark.pandas.DataFrame.plot` or :meth:`pyspark.pandas.Series.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         Notes
         -----
         In Plotly and Matplotlib, the interpretation of `x` and `y` for `barh` plots differs.
         In Plotly, `x` refers to the values and `y` refers to the categories.
         In Matplotlib, `x` refers to the categories and `y` refers to the values.
         Ensure correct axis labeling based on the backend used.

         See Also
         --------
         plotly.express.bar : Plot a vertical bar plot using plotly.
         matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.

         Examples
         --------
         For Series:

         .. plotly::

             >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
             >>> df.val.plot.barh()  # doctest: +SKIP

         For DataFrame:

         .. plotly::

             >>> df = ps.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
             >>> df.plot.barh(x='lab', y='val')  # doctest: +SKIP

         Plot a whole DataFrame to a horizontal bar plot

         .. plotly::

             >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
             >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
             >>> index = ['snail', 'pig', 'elephant',
             ...          'rabbit', 'giraffe', 'coyote', 'horse']
             >>> df = ps.DataFrame({'speed': speed,
             ...                    'lifespan': lifespan}, index=index)
             >>> df.plot.barh()  # doctest: +SKIP

         Plot a column of the DataFrame to a horizontal bar plot

         .. plotly::

             >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
             >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
             >>> index = ['snail', 'pig', 'elephant',
             ...          'rabbit', 'giraffe', 'coyote', 'horse']
             >>> df = ps.DataFrame({'speed': speed,
             ...                    'lifespan': lifespan}, index=index)
             >>> df.plot.barh(y='speed')  # doctest: +SKIP

         Plot DataFrame versus the desired column

         .. plotly::

             >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
             >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
             >>> index = ['snail', 'pig', 'elephant',
             ...          'rabbit', 'giraffe', 'coyote', 'horse']
             >>> df = ps.DataFrame({'speed': speed,
             ...                    'lifespan': lifespan}, index=index)
             >>> df.plot.barh(x='lifespan')  # doctest: +SKIP
         """
         from pyspark.pandas import DataFrame, Series

         if isinstance(self.data, Series):
             return self(kind="barh", **kwargs)
         elif isinstance(self.data, DataFrame):
             return self(kind="barh", x=x, y=y, **kwargs)

     def box(self, **kwds):
         """
         Make a box plot of the DataFrame columns.

         A box plot is a method for graphically depicting groups of numerical data through
         their quartiles. The box extends from the Q1 to Q3 quartile values of the data,
         with a line at the median (Q2). The whiskers extend from the edges of box to show
         the range of the data. The position of the whiskers is set by default to
         1.5*IQR (IQR = Q3 - Q1) from the edges of the box. Outlier points are those past
         the end of the whiskers.

         A consideration when using this chart is that the box and the whiskers can overlap,
         which is very common when plotting small sets of data.

         Parameters
         ----------
         **kwds : dict, optional
             Extra arguments to `precision`: refer to a float that is used by
             pandas-on-Spark to compute approximate statistics for building a
             boxplot. The default value is 0.01. Use smaller values to get more
             precise statistics. Additional keyword arguments are documented in
             :meth:`pyspark.pandas.Series.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         Notes
         -----
         There are behavior differences between pandas-on-Spark and pandas.

         * pandas-on-Spark computes approximate statistics - expect differences between
           pandas and pandas-on-Spark boxplots, especially regarding 1st and 3rd quartiles.
         * The `whis` argument is only supported as a single number.
         * pandas-on-Spark doesn't support the following argument(s) (matplotlib-only).

           * `bootstrap` argument is not supported
           * `autorange` argument is not supported

         Examples
         --------
         Draw a box plot from a DataFrame with four columns of randomly
         generated data.

         For Series:

         .. plotly::

             >>> data = np.random.randn(25, 4)
             >>> df = ps.DataFrame(data, columns=list('ABCD'))
             >>> df['A'].plot.box()  # doctest: +SKIP

         This is an unsupported function for DataFrame type
         """
         from pyspark.pandas import DataFrame, Series

         if isinstance(self.data, (Series, DataFrame)):
             return self(kind="box", **kwds)

     def hist(self, bins=10, **kwds):
         """
         Draw one histogram of the DataFrame’s columns.

         A `histogram`_ is a representation of the distribution of data.
         This function calls :meth:`plotting.backend.plot`,
         on each series in the DataFrame, resulting in one histogram per column.
         This is useful when the DataFrame’s Series are in a similar scale.

         .. _histogram: https://en.wikipedia.org/wiki/Histogram

         Parameters
         ----------
         bins : integer or sequence, default 10
             Number of histogram bins to be used. If an integer is given, bins + 1
             bin edges are calculated and returned. If bins is a sequence, it gives
             bin edges, including left edge of first bin and right edge of last
             bin. In this case, bins are returned unmodified.
         **kwds
             All other plotting keyword arguments to be passed to
             plotting backend.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         Examples
         --------
         Basic plot.

         For Series:

         .. plotly::

             >>> s = ps.Series([1, 3, 2])
             >>> s.plot.hist()  # doctest: +SKIP

         For DataFrame:

         .. plotly::

             >>> df = pd.DataFrame(
             ...     np.random.randint(1, 7, 6000),
             ...     columns=['one'])
             >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
             >>> df = ps.from_pandas(df)
             >>> df.plot.hist(bins=12, alpha=0.5)  # doctest: +SKIP
         """
         return self(kind="hist", bins=bins, **kwds)

     def kde(self, bw_method=None, ind=None, **kwargs):
         """
         Generate Kernel Density Estimate plot using Gaussian kernels.

         In statistics, kernel density estimation (KDE) is a non-parametric way to
         estimate the probability density function (PDF) of a random variable. This
         function uses Gaussian kernels and includes automatic bandwidth determination.

         Parameters
         ----------
         bw_method : scalar
             The method used to calculate the estimator bandwidth.
             See KernelDensity in PySpark for more information.
         ind : NumPy array or integer, optional
             Evaluation points for the estimated PDF. If None (default),
             1000 equally spaced points are used. If `ind` is a NumPy array, the
             KDE is evaluated at the points passed. If `ind` is an integer,
             `ind` number of equally spaced points are used.
         **kwargs : optional
             Keyword arguments to pass on to :meth:`pandas-on-Spark.Series.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         Examples
         --------
         A scalar bandwidth should be specified. Using a small bandwidth value can
         lead to over-fitting, while using a large bandwidth value may result
         in under-fitting:

         .. plotly::

             >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
             >>> s.plot.kde(bw_method=0.3, ind=100)  # doctest: +SKIP

         .. plotly::

             >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
             >>> s.plot.kde(bw_method=3, ind=100)  # doctest: +SKIP

         The `ind` parameter determines the evaluation points for the
         plot of the estimated KDF:

         .. plotly::

             >>> s = ps.Series([1, 2, 2.5, 3, 3.5, 4, 5])
             >>> s.plot.kde(ind=[1, 2, 3, 4, 5], bw_method=0.3)  # doctest: +SKIP

         For DataFrame, it works in the same way as Series:

         .. plotly::

             >>> df = ps.DataFrame({
             ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
             ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
             ... })
             >>> df.plot.kde(bw_method=0.3, ind=100)  # doctest: +SKIP

         .. plotly::

             >>> df = ps.DataFrame({
             ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
             ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
             ... })
             >>> df.plot.kde(bw_method=3, ind=100)  # doctest: +SKIP

         .. plotly::

             >>> df = ps.DataFrame({
             ...     'x': [1, 2, 2.5, 3, 3.5, 4, 5],
             ...     'y': [4, 4, 4.5, 5, 5.5, 6, 6],
             ... })
             >>> df.plot.kde(ind=[1, 2, 3, 4, 5, 6], bw_method=0.3)  # doctest: +SKIP
         """
         return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)

     density = kde

     def area(self, x=None, y=None, **kwds):
         """
         Draw a stacked area plot.

         An area plot displays quantitative data visually.
         This function wraps the plotly area function.

         Parameters
         ----------
         x : label or position, optional
             Coordinates for the X axis. By default it uses the index.
         y : label or position, optional
             Column to plot. By default it uses all columns.
         stacked : bool, default True
             Area plots are stacked by default. Set to False to create an
             unstacked plot (matplotlib-only).
         **kwds : optional
             Additional keyword arguments are documented in
             :meth:`DataFrame.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         Examples
         --------

         For Series

         .. plotly::

             >>> df = ps.DataFrame({
             ...     'sales': [3, 2, 3, 9, 10, 6],
             ...     'signups': [5, 5, 6, 12, 14, 13],
             ...     'visits': [20, 42, 28, 62, 81, 50],
             ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
             ...                        freq='ME'))
             >>> df.sales.plot.area()  # doctest: +SKIP

         For DataFrame

         .. plotly::

             >>> df = ps.DataFrame({
             ...     'sales': [3, 2, 3, 9, 10, 6],
             ...     'signups': [5, 5, 6, 12, 14, 13],
             ...     'visits': [20, 42, 28, 62, 81, 50],
             ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
             ...                        freq='ME'))
             >>> df.plot.area()  # doctest: +SKIP
         """
         from pyspark.pandas import DataFrame, Series

         if isinstance(self.data, Series):
             return self(kind="area", **kwds)
         elif isinstance(self.data, DataFrame):
             return self(kind="area", x=x, y=y, **kwds)

     def pie(self, **kwds):
         """
         Generate a pie plot.

         A pie plot is a proportional representation of the numerical data in a
         column. This function wraps :meth:`plotly.express.pie` for the
         specified column.

         Parameters
         ----------
         y : int or label, optional
             Label or position of the column to plot.
             If not provided, ``subplots=True`` argument must be passed (matplotlib-only).
         **kwds
             Keyword arguments to pass on to :meth:`pandas-on-Spark.Series.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         Examples
         --------

         For Series:

         .. plotly::

             >>> df = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
             ...                    'radius': [2439.7, 6051.8, 6378.1]},
             ...                   index=['Mercury', 'Venus', 'Earth'])
             >>> df.mass.plot.pie()  # doctest: +SKIP


         For DataFrame:

         .. plotly::

             >>> df = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
             ...                    'radius': [2439.7, 6051.8, 6378.1]},
             ...                   index=['Mercury', 'Venus', 'Earth'])
             >>> df.plot.pie(y='mass')  # doctest: +SKIP
         """
         from pyspark.pandas import DataFrame, Series

         if isinstance(self.data, Series):
             return self(kind="pie", **kwds)
         else:
             # pandas will raise an error if y is None and subplots if not True
             if (
                 isinstance(self.data, DataFrame)
                 and kwds.get("y", None) is None
                 and not kwds.get("subplots", False)
             ):
                 raise ValueError(
                     "pie requires either y column or 'subplots=True' (matplotlib-only)"
                 )
             return self(kind="pie", **kwds)

     def scatter(self, x, y, **kwds):
         """
         Create a scatter plot with varying marker point size and color.

         The coordinates of each point are defined by two dataframe columns and
         filled circles are used to represent each point. This kind of plot is
         useful to see complex correlations between two variables. Points could
         be for instance natural 2D coordinates like longitude and latitude in
         a map or, in general, any pair of metrics that can be plotted against
         each other.

         Parameters
         ----------
         x : int or str
             The column name or column position to be used as horizontal
             coordinates for each point.
         y : int or str
             The column name or column position to be used as vertical
             coordinates for each point.
         s : scalar or array_like, optional
             (matplotlib-only).
         c : str, int or array_like, optional
             (matplotlib-only).

         **kwds: Optional
             Keyword arguments to pass on to :meth:`pyspark.pandas.DataFrame.plot`.

         Returns
         -------
         :class:`plotly.graph_objs.Figure`
             Return an custom object when ``backend!=plotly``.
             Return an ndarray when ``subplots=True`` (matplotlib-only).

         See Also
         --------
         plotly.express.scatter : Scatter plot using multiple input data
             formats (plotly).
         matplotlib.pyplot.scatter : Scatter plot using multiple input data
             formats (matplotlib).

         Examples
         --------
         Let's see how to draw a scatter plot using coordinates from the values
         in a DataFrame's columns.

         .. plotly::

             >>> df = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
             ...                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
             ...                   columns=['length', 'width', 'species'])
             >>> df.plot.scatter(x='length', y='width')  # doctest: +SKIP

         And now with dark scheme:

         .. plotly::

             >>> df = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
             ...                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
             ...                   columns=['length', 'width', 'species'])
             >>> fig = df.plot.scatter(x='length', y='width')
             >>> fig.update_layout(template="plotly_dark")  # doctest: +SKIP
         """
         return self(kind="scatter", x=x, y=y, **kwds)

     def hexbin(self, **kwds):
         return unsupported_function(class_name="pd.DataFrame", method_name="hexbin")()