python/pyspark/pandas/plot/matplotlib.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 from typing import final

 from pyspark.loose_version import LooseVersion

 import matplotlib as mat
 import numpy as np
 from matplotlib.axes._base import _process_plot_format  # type: ignore[attr-defined]
 from matplotlib.figure import Figure
 from pandas.core.dtypes.inference import is_list_like
 from pandas.io.formats.printing import pprint_thing
 from pandas.plotting._matplotlib import (  # type: ignore[attr-defined]
     BarPlot as PandasBarPlot,
     BoxPlot as PandasBoxPlot,
     HistPlot as PandasHistPlot,
     PiePlot as PandasPiePlot,
     AreaPlot as PandasAreaPlot,
     LinePlot as PandasLinePlot,
     BarhPlot as PandasBarhPlot,
     ScatterPlot as PandasScatterPlot,
     KdePlot as PandasKdePlot,
 )
 from pandas.plotting._core import PlotAccessor
 from pandas.plotting._matplotlib.core import MPLPlot as PandasMPLPlot

 from pyspark.pandas.plot import (
     TopNPlotBase,
     SampledPlotBase,
     HistogramPlotBase,
     BoxPlotBase,
     unsupported_function,
     KdePlotBase,
 )
 from pyspark.pandas.series import Series, first_series

 # Plot kinds declared by pandas' ``PlotAccessor``; ``_plot`` validates the requested
 # ``kind`` against this collection before looking up the plot class.
 _all_kinds = PlotAccessor._all_kinds  # type: ignore[attr-defined]


 def _set_ticklabels(ax, labels, is_vertical, **kwargs) -> None:
     """Set the tick labels of a given axis.

     Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the
     case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of
     labels.
     """
     ticks = ax.get_xticks() if is_vertical else ax.get_yticks()
     if len(ticks) != len(labels):
         i, remainder = divmod(len(ticks), len(labels))
         assert remainder == 0, remainder
         labels *= i
     if is_vertical:
         ax.set_xticklabels(labels, **kwargs)
     else:
         ax.set_yticklabels(labels, **kwargs)


 class PandasOnSparkBarPlot(PandasBarPlot, TopNPlotBase):
     """Bar plot built on pandas' ``BarPlot`` from the data returned by ``get_top_n``."""

     _kind = "bar"

     def __init__(self, data, **kwargs):
         top_n = self.get_top_n(data)
         super().__init__(top_n, **kwargs)

     def _plot(self, ax, x, y, w, start=0, log=False, **kwds):
         """Call ``set_result_text`` on ``ax`` and draw the bars with ``ax.bar``."""
         self.set_result_text(ax)
         bars = ax.bar(x, y, w, bottom=start, log=log, **kwds)
         return bars


 class PandasOnSparkBoxPlot(PandasBoxPlot, BoxPlotBase):
     """Box plot drawn from statistics computed by ``BoxPlotBase.compute_box``."""

     _kind = "box"

     def boxplot(
         self,
         ax,
         bxpstats,
         notch=None,
         sym=None,
         vert=None,
         whis=None,
         positions=None,
         widths=None,
         patch_artist=None,
         bootstrap=None,
         usermedians=None,
         conf_intervals=None,
         meanline=None,
         showmeans=None,
         showcaps=None,
         showbox=None,
         showfliers=None,
         boxprops=None,
         labels=None,
         flierprops=None,
         medianprops=None,
         meanprops=None,
         capprops=None,
         whiskerprops=None,
         manage_ticks=None,
         # manage_xticks is for compatibility of matplotlib < 3.1.0.
         # Remove this once the minimum supported matplotlib version is at least 3.1.0.
         manage_xticks=None,
         autorange=False,
         zorder=None,
         precision=None,
     ):
         """Draw pre-computed box plot statistics on ``ax`` through ``ax.bxp``.

         The ``*props`` dictionaries are completed with the matching ``boxplot.*``
         matplotlib rc parameters (on copies, so dictionaries supplied by the caller
         are left untouched). ``sym`` is folded into ``flierprops``, while
         ``usermedians`` and ``conf_intervals`` are written into the entries of
         ``bxpstats`` (which are updated in place) before drawing.

         ``whis``, ``bootstrap``, ``labels``, ``autorange`` and ``precision`` are
         accepted but not used by this method.

         Returns
         -------
         The value returned by ``ax.bxp``.

         Raises
         ------
         ValueError
             If ``usermedians`` or ``conf_intervals`` do not match the length of
             ``bxpstats``, or a confidence interval does not have two values.
         """

         def update_dict(dictionary, rc_name, properties):
             """Return a copy of ``dictionary`` with missing properties loaded from
             the rc file."""
             rc_str = "boxplot.{0}.{1}"
             # Work on a copy: filling defaults into (or, for ``patch_artist``, popping
             # "color" from) a dict owned by the caller would leak state into later
             # plots that reuse the same dict.
             dictionary = {} if dictionary is None else dict(dictionary)
             for prop_dict in properties:
                 dictionary.setdefault(prop_dict, mat.rcParams[rc_str.format(rc_name, prop_dict)])
             return dictionary

         # Common property dictionaries loading from rc
         flier_props = [
             "color",
             "marker",
             "markerfacecolor",
             "markeredgecolor",
             "markersize",
             "linestyle",
             "linewidth",
         ]
         default_props = ["color", "linewidth", "linestyle"]

         boxprops = update_dict(boxprops, "boxprops", default_props)
         whiskerprops = update_dict(whiskerprops, "whiskerprops", default_props)
         capprops = update_dict(capprops, "capprops", default_props)
         medianprops = update_dict(medianprops, "medianprops", default_props)
         meanprops = update_dict(meanprops, "meanprops", default_props)
         flierprops = update_dict(flierprops, "flierprops", flier_props)

         if patch_artist:
             boxprops["linestyle"] = "solid"
             boxprops["edgecolor"] = boxprops.pop("color")

         # if non-default sym value, put it into the flier dictionary
         # the logic for providing the default symbol ('b+') now lives
         # in bxp in the initial value of final_flierprops
         # handle all of the `sym` related logic here so we only have to pass
         # on the flierprops dict.
         if sym is not None:
             # no-flier case, which should really be done with
             # 'showfliers=False' but none-the-less deal with it to keep back
             # compatibility
             if sym == "":
                 # blow away existing dict and make one for invisible markers
                 flierprops = dict(linestyle="none", marker="", color="none")
                 # turn the fliers off just to be safe
                 showfliers = False
             # now process the symbol string
             else:
                 # process the symbol string
                 # discarded linestyle
                 _, marker, color = _process_plot_format(sym)
                 # if we have a marker, use it
                 if marker is not None:
                     flierprops["marker"] = marker
                 # if we have a color, use it
                 if color is not None:
                     # assume that if color is passed in the user want
                     # filled symbol, if the users want more control use
                     # flierprops
                     flierprops["color"] = color
                     flierprops["markerfacecolor"] = color
                     flierprops["markeredgecolor"] = color

         # replace medians if necessary:
         if usermedians is not None:
             if len(np.ravel(usermedians)) != len(bxpstats) or np.shape(usermedians)[0] != len(
                 bxpstats
             ):
                 raise ValueError("usermedians length not compatible with x")
             else:
                 # reassign medians as necessary
                 for stats, med in zip(bxpstats, usermedians):
                     if med is not None:
                         stats["med"] = med

         if conf_intervals is not None:
             if np.shape(conf_intervals)[0] != len(bxpstats):
                 err_mess = "conf_intervals length not compatible with x"
                 raise ValueError(err_mess)
             else:
                 for stats, ci in zip(bxpstats, conf_intervals):
                     if ci is not None:
                         if len(ci) != 2:
                             raise ValueError("each confidence interval must have two values")
                         else:
                             if ci[0] is not None:
                                 stats["cilo"] = ci[0]
                             if ci[1] is not None:
                                 stats["cihi"] = ci[1]

         # ``manage_ticks`` wins over the legacy ``manage_xticks`` when both are given.
         should_manage_ticks = True
         if manage_xticks is not None:
             should_manage_ticks = manage_xticks
         if manage_ticks is not None:
             should_manage_ticks = manage_ticks

         # The keyword accepted by ``ax.bxp`` depends on the matplotlib version.
         if LooseVersion(mat.__version__) < LooseVersion("3.1.0"):
             extra_args = {"manage_xticks": should_manage_ticks}
         else:
             extra_args = {"manage_ticks": should_manage_ticks}

         artists = ax.bxp(
             bxpstats,
             positions=positions,
             widths=widths,
             vert=vert,
             patch_artist=patch_artist,
             shownotches=notch,
             showmeans=showmeans,
             showcaps=showcaps,
             showbox=showbox,
             boxprops=boxprops,
             flierprops=flierprops,
             medianprops=medianprops,
             meanprops=meanprops,
             meanline=meanline,
             showfliers=showfliers,
             capprops=capprops,
             whiskerprops=whiskerprops,
             zorder=zorder,
             **extra_args,
         )
         return artists

     def _plot(self, ax, bxpstats, column_num=None, return_type="axes", **kwds):
         """Draw the boxes and return ``(return object, artists)``.

         The first element depends on ``return_type``: the artists for ``"dict"``, a
         ``self.BP`` tuple for ``"both"`` and ``ax`` otherwise.
         """
         bp = self.boxplot(ax, bxpstats, **kwds)

         if return_type == "dict":
             return bp, bp
         elif return_type == "both":
             return self.BP(ax=ax, lines=bp), bp
         else:
             return ax, bp

     @final
     def _ensure_frame(self, data):
         """Convert a Series into a single-column frame; other inputs pass through."""
         if isinstance(data, Series):
             label = self.label
             if label is None and data.name is None:
                 label = ""
             if label is None:
                 data = data.to_frame()
             else:
                 data = data.to_frame(name=label)
         return data

     def _compute_plot_data(self):
         """Compute the box statistics and store them in ``self.data``.

         ``self.data`` is replaced by a one-entry dict mapping the label to a list
         holding the statistics dict that is later passed to ``ax.bxp``.
         """
         data = self.data
         data = first_series(data) if not isinstance(data, Series) else data
         colname = data.name
         spark_column_name = data._internal.spark_column_name_for(data._column_label)

         # Updates all props with the rc defaults from matplotlib
         self.kwds.update(PandasOnSparkBoxPlot.rc_defaults(**self.kwds))

         # Gets some important kwds
         showfliers = self.kwds.get("showfliers", False)
         whis = self.kwds.get("whis", 1.5)
         labels = self.kwds.get("labels", [colname])

         # This one is pandas-on-Spark specific to control precision for approx_percentile
         precision = self.kwds.get("precision", 0.01)

         results = BoxPlotBase.compute_box(
             data._psdf._internal.resolved_copy.spark_frame,
             [spark_column_name],
             whis,
             precision,
             showfliers,
         )
         assert len(results) == 1
         result = results[0]

         # Builds bxpstats dict
         stats = []
         item = {
             "mean": result["mean"],
             "med": result["med"],
             "q1": result["q1"],
             "q3": result["q3"],
             "whislo": result["lower_whisker"],
             "whishi": result["upper_whisker"],
             "fliers": result["fliers"] if result["fliers"] else [],
             "label": labels[0],
         }
         stats.append(item)

         self.data = {labels[0]: stats}

     def _make_plot(self, fig: Figure):
         """Draw the box plot on the first axes and set its tick labels.

         At most 1,000 fliers are drawn per box; when the list is truncated a note is
         written on the axes.
         """
         bxpstats = list(self.data.values())[0]
         ax = self._get_ax(0)
         kwds = self.kwds.copy()

         for stats in bxpstats:
             if len(stats["fliers"]) > 1000:
                 stats["fliers"] = stats["fliers"][:1000]
                 ax.text(
                     1,
                     1,
                     "showing top 1,000 fliers only",
                     size=6,
                     ha="right",
                     va="bottom",
                     transform=ax.transAxes,
                 )

         ret, bp = self._plot(ax, bxpstats, column_num=0, return_type=self.return_type, **kwds)
         self.maybe_color_bp(bp)
         self._return_obj = ret

         labels = [pprint_thing(lbl) for lbl in self.data]
         if not self.use_index:
             labels = [pprint_thing(key) for key in range(len(labels))]
         _set_ticklabels(ax, labels, self.orientation == "vertical")

     @staticmethod
     def rc_defaults(
         notch=None,
         vert=None,
         whis=None,
         patch_artist=None,
         bootstrap=None,
         meanline=None,
         showmeans=None,
         showcaps=None,
         showbox=None,
         showfliers=None,
         **kwargs,
     ):
         """Return the box plot options, with every ``None`` replaced by its
         ``boxplot.*`` matplotlib rc parameter.

         Extra keyword arguments are accepted and ignored.
         """
         # Missing arguments default to rcParams.
         if whis is None:
             whis = mat.rcParams["boxplot.whiskers"]
         if bootstrap is None:
             bootstrap = mat.rcParams["boxplot.bootstrap"]

         if notch is None:
             notch = mat.rcParams["boxplot.notch"]
         if vert is None:
             vert = mat.rcParams["boxplot.vertical"]
         if patch_artist is None:
             patch_artist = mat.rcParams["boxplot.patchartist"]
         if meanline is None:
             meanline = mat.rcParams["boxplot.meanline"]
         if showmeans is None:
             showmeans = mat.rcParams["boxplot.showmeans"]
         if showcaps is None:
             showcaps = mat.rcParams["boxplot.showcaps"]
         if showbox is None:
             showbox = mat.rcParams["boxplot.showbox"]
         if showfliers is None:
             showfliers = mat.rcParams["boxplot.showfliers"]

         return dict(
             whis=whis,
             bootstrap=bootstrap,
             notch=notch,
             vert=vert,
             patch_artist=patch_artist,
             meanline=meanline,
             showmeans=showmeans,
             showcaps=showcaps,
             showbox=showbox,
             showfliers=showfliers,
         )


 class PandasOnSparkHistPlot(PandasHistPlot, HistogramPlotBase):
     """Histogram plot whose bins and counts are prepared by ``HistogramPlotBase``."""

     _kind = "hist"

     def _args_adjust(self):
         """Turn a list-like ``bottom`` into a NumPy array."""
         bottom = self.bottom
         if is_list_like(bottom):
             self.bottom = np.array(bottom)

     @final
     def _ensure_frame(self, data):
         """Convert a Series into a single-column frame; other inputs pass through."""
         if not isinstance(data, Series):
             return data
         name = self.label
         if name is None and data.name is None:
             name = ""
         if name is None:
             return data.to_frame()
         return data.to_frame(name=name)

     def _calculate_bins(self, data, bins):
         """Return ``bins`` unchanged."""
         return bins

     def _compute_plot_data(self):
         """Replace ``self.data`` and ``self.bins`` with the prepared histogram inputs."""
         prepared = HistogramPlotBase.prepare_hist_data(self.data, self.bins)
         self.data, self.bins = prepared

     def _make_plot_keywords(self, kwds, y):
         """Add the ``bottom`` and ``bins`` histogram options to ``kwds``.

         ``y`` is unused here; the Kde plot's version of this hook needs it.
         """
         kwds["bottom"] = self.bottom
         kwds["bins"] = self.bins
         return kwds

     def _make_plot(self, fig: Figure):
         """Draw one histogram per column label from the pre-computed counts."""
         # TODO: this logic is similar to KdePlot. Might have to deduplicate it.
         # Computing the real 'num_colors' needs the `shape`, i.e. a full count.
         # Stick to 1 for now to avoid that computation.
         colors = self._get_colors(num_colors=1)
         stacking_id = self._get_stacking_id()
         histograms = HistogramPlotBase.compute_hist(self.data, self.bins)
         column_labels = self.data._internal.column_labels

         for i, (label, y) in enumerate(zip(column_labels, histograms)):
             ax = self._get_ax(i)
             kwds = self.kwds.copy()

             label = pprint_thing(label if len(label) > 1 else label[0])
             # The `hasattr` check keeps plotting compatible with pandas < 1.3,
             # see pandas-dev/pandas#40078.
             if hasattr(self, "_mark_right_label"):
                 label = self._mark_right_label(label, index=i)
             kwds["label"] = label

             style, kwds = self._apply_style_colors(colors, kwds, i, label)
             if style is not None:
                 kwds["style"] = style

             kwds = self._make_plot_keywords(kwds, y)
             artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
             # The `hasattr` check keeps plotting compatible with pandas < 1.3,
             # see pandas-dev/pandas#40078.
             if hasattr(self, "_append_legend_handles_labels"):
                 self._append_legend_handles_labels(artists[0], label)
             else:
                 self._add_legend_handle(artists[0], label, index=i)

     @classmethod
     def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, stacking_id=None, **kwds):
         """Draw one column's pre-computed counts ``y`` on ``ax`` and return the patches."""
         num_bins = len(bins) - 1
         if column_num == 0:
             cls._initialize_stacker(ax, stacking_id, num_bins)

         base = np.zeros(num_bins)
         bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"])

         # The counts already exist, so each bin gets exactly one entry and the
         # counts are supplied as its weight.
         counts, _, patches = ax.hist(bins[:-1], bins=bins, bottom=bottom, weights=y, **kwds)

         cls._update_stacker(ax, stacking_id, counts)
         return patches


 class PandasOnSparkPiePlot(PandasPiePlot, TopNPlotBase):
     """Pie plot built on pandas' ``PiePlot`` from the data returned by ``get_top_n``."""

     _kind = "pie"

     def __init__(self, data, **kwargs):
         top_n = self.get_top_n(data)
         super().__init__(top_n, **kwargs)

     def _make_plot(self, fig: Figure):
         """Call ``set_result_text`` on the first axes, then let pandas draw."""
         first_ax = self._get_ax(0)
         self.set_result_text(first_ax)
         super()._make_plot(fig)


 class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase):
     """Area plot built on pandas' ``AreaPlot`` from the data returned by ``get_sampled``."""

     _kind = "area"

     def __init__(self, data, **kwargs):
         sampled = self.get_sampled(data)
         super().__init__(sampled, **kwargs)

     def _make_plot(self, fig: Figure):
         """Call ``set_result_text`` on the first axes, then let pandas draw."""
         first_ax = self._get_ax(0)
         self.set_result_text(first_ax)
         super()._make_plot(fig)


 class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase):
     """Line plot built on pandas' ``LinePlot`` from the data returned by ``get_sampled``."""

     _kind = "line"

     def __init__(self, data, **kwargs):
         sampled = self.get_sampled(data)
         super().__init__(sampled, **kwargs)

     def _make_plot(self, fig: Figure):
         """Call ``set_result_text`` on the first axes, then let pandas draw."""
         first_ax = self._get_ax(0)
         self.set_result_text(first_ax)
         super()._make_plot(fig)


 class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase):
     """Horizontal bar plot built on pandas' ``BarhPlot`` from the data returned by
     ``get_top_n``."""

     _kind = "barh"

     def __init__(self, data, **kwargs):
         top_n = self.get_top_n(data)
         super().__init__(top_n, **kwargs)

     def _make_plot(self, fig: Figure):
         """Call ``set_result_text`` on the first axes, then let pandas draw."""
         first_ax = self._get_ax(0)
         self.set_result_text(first_ax)
         super()._make_plot(fig)


 class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase):
     """Scatter plot built on pandas' ``ScatterPlot`` from the data returned by
     ``get_top_n``."""

     _kind = "scatter"

     def __init__(self, data, x, y, **kwargs):
         top_n = self.get_top_n(data)
         super().__init__(top_n, x, y, **kwargs)

     def _make_plot(self, fig: Figure):
         """Call ``set_result_text`` on the first axes, then let pandas draw."""
         first_ax = self._get_ax(0)
         self.set_result_text(first_ax)
         super()._make_plot(fig)


 class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase):
     """Kernel density plot whose data and densities are computed by ``KdePlotBase``."""

     _kind = "kde"

     def _compute_plot_data(self):
         """Replace ``self.data`` with the output of ``KdePlotBase.prepare_kde_data``."""
         prepared = KdePlotBase.prepare_kde_data(self.data)
         self.data = prepared

     def _make_plot_keywords(self, kwds, y):
         """Add the ``bw_method`` and ``ind`` density options to ``kwds``."""
         kwds["bw_method"] = self.bw_method
         kwds["ind"] = type(self)._get_ind(y, ind=self.ind)
         return kwds

     def _make_plot(self, fig: Figure):
         """Draw one density curve per column label."""
         # Computing the real 'num_colors' needs the `shape`, i.e. a full count.
         # Stick to 1 for now to avoid that computation.
         colors = self._get_colors(num_colors=1)
         stacking_id = self._get_stacking_id()

         internal = self.data._internal
         sdf = internal.spark_frame

         for i, label in enumerate(self.data._internal.column_labels):
             # 'y' is a Spark DataFrame holding just this one column.
             y = sdf.select(self.data._internal.spark_column_for(label))
             ax = self._get_ax(i)
             kwds = self.kwds.copy()

             label = pprint_thing(label if len(label) > 1 else label[0])
             # The `hasattr` check keeps plotting compatible with pandas < 1.3,
             # see pandas-dev/pandas#40078.
             if hasattr(self, "_mark_right_label"):
                 label = self._mark_right_label(label, index=i)
             kwds["label"] = label

             style, kwds = self._apply_style_colors(colors, kwds, i, label)
             if style is not None:
                 kwds["style"] = style

             kwds = self._make_plot_keywords(kwds, y)
             artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
             # The `hasattr` check keeps plotting compatible with pandas < 1.3,
             # see pandas-dev/pandas#40078.
             if hasattr(self, "_append_legend_handles_labels"):
                 self._append_legend_handles_labels(artists[0], label)
             else:
                 self._add_legend_handle(artists[0], label, index=i)

     @staticmethod
     def _get_ind(y, ind):
         """Delegate to ``KdePlotBase.get_ind``."""
         return KdePlotBase.get_ind(y, ind)

     @classmethod
     def _plot(
         cls, ax, y, style=None, bw_method=None, ind=None, column_num=None, stacking_id=None, **kwds
     ):
         """Compute the density of ``y`` at ``ind`` and draw it on ``ax``."""
         density = KdePlotBase.compute_kde(y, bw_method=bw_method, ind=ind)
         return PandasMPLPlot._plot(ax, ind, density, style=style, **kwds)


 # Plot classes provided by this module.
 _klasses = [
     PandasOnSparkHistPlot,
     PandasOnSparkBarPlot,
     PandasOnSparkBoxPlot,
     PandasOnSparkPiePlot,
     PandasOnSparkAreaPlot,
     PandasOnSparkLinePlot,
     PandasOnSparkBarhPlot,
     PandasOnSparkScatterPlot,
     PandasOnSparkKdePlot,
 ]
 # Lookup from plot kind (each class' ``_kind``) to the class implementing it.
 _plot_klass = {klass._kind: klass for klass in _klasses}

 # Kinds accepted for both Series and DataFrame, then the per-type supersets.
 _common_kinds = {"area", "bar", "barh", "box", "hist", "kde", "line", "pie"}
 _series_kinds = _common_kinds | set()
 _dataframe_kinds = _common_kinds | {"scatter", "hexbin"}
 _pandas_on_spark_all_kinds = _common_kinds | _series_kinds | _dataframe_kinds


 def plot_pandas_on_spark(data, kind, **kwargs):
     """Plot a pandas-on-Spark Series or DataFrame with the matplotlib backend.

     Raises ``ValueError`` when ``kind`` is not one of the kinds known to this
     module. A kind that is known but not available for the type of ``data`` is
     reported through ``unsupported_function``. Returns ``None`` when ``data`` is
     neither a Series nor a DataFrame.
     """
     if kind not in _pandas_on_spark_all_kinds:
         raise ValueError("{} is not a valid plot kind".format(kind))

     from pyspark.pandas import DataFrame, Series

     if isinstance(data, Series):
         allowed, class_name, plotter = _series_kinds, "pd.Series", plot_series
     elif isinstance(data, DataFrame):
         allowed, class_name, plotter = _dataframe_kinds, "pd.DataFrame", plot_frame
     else:
         return None

     if kind not in allowed:
         return unsupported_function(class_name=class_name, method_name=kind)()
     return plotter(data=data, kind=kind, **kwargs)


 def plot_series(
     data,
     kind="line",
     ax=None,  # Series unique
     figsize=None,
     use_index=True,
     title=None,
     grid=None,
     legend=False,
     style=None,
     logx=False,
     logy=False,
     loglog=False,
     xticks=None,
     yticks=None,
     xlim=None,
     ylim=None,
     rot=None,
     fontsize=None,
     colormap=None,
     table=False,
     yerr=None,
     xerr=None,
     label=None,
     secondary_y=False,  # Series unique
     **kwds,
 ):
     """
     Plot a Series using matplotlib / pylab.

     Every plot kind is also exposed as a method of the ``Series.plot``
     accessor, so ``s.plot(kind='line')`` and ``s.plot.line()`` are
     equivalent.

     Parameters
     ----------
     data : Series

     kind : str
         - 'line' : line plot (default)
         - 'bar' : vertical bar plot
         - 'barh' : horizontal bar plot
         - 'hist' : histogram
         - 'box' : boxplot
         - 'kde' : Kernel Density Estimation plot
         - 'density' : same as 'kde'
         - 'area' : area plot
         - 'pie' : pie plot

     ax : matplotlib axes object
         Axes to draw on. If not passed and a matplotlib figure already
         exists, the current axes (``plt.gca()``) are used.
     figsize : a tuple (width, height) in inches
     use_index : boolean, default True
         Use index as ticks for x axis
     title : string or list
         Title of the plot. A string is printed at the top of the figure.
         A list, together with `subplots` set to True, puts each item above
         the corresponding subplot.
     grid : boolean, default None (matlab style default)
         Axis grid lines
     legend : False/True/'reverse'
         Place legend on axis subplots
     style : list or dict
         matplotlib line style per column
     logx : boolean, default False
         Use log scaling on x axis
     logy : boolean, default False
         Use log scaling on y axis
     loglog : boolean, default False
         Use log scaling on both x and y axes
     xticks : sequence
         Values to use for the xticks
     yticks : sequence
         Values to use for the yticks
     xlim : 2-tuple/list
     ylim : 2-tuple/list
     rot : int, default None
         Rotation for ticks (xticks for vertical, yticks for horizontal plots)
     fontsize : int, default None
         Font size for xticks and yticks
     colormap : str or matplotlib colormap object, default None
         Colormap to select colors from. A string is looked up as a
         matplotlib colormap name.
     colorbar : boolean, optional
         If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots)
     position : float
         Relative alignment for bar plot layout, from 0 (left/bottom-end)
         to 1 (right/top-end). Default is 0.5 (center)
     table : boolean, Series or DataFrame, default False
         If True, draw a table using the data in the DataFrame; the data is
         transposed to meet matplotlib's default layout.
         If a Series or DataFrame is passed, that data is used for the table.
     yerr : DataFrame, Series, array-like, dict and str
         See :ref:`Plotting with Error Bars <visualization.errorbars>` for
         detail.
     xerr : same types as yerr.
     label : label argument to provide to plot
     secondary_y : boolean or sequence of ints, default False
         If True then y-axis will be on the right
     mark_right : boolean, default True
         When using a secondary_y axis, automatically mark the column
         labels with "(right)" in the legend
     **kwds : keywords
         Options to pass to matplotlib plotting method

     Returns
     -------
     axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them

     Notes
     -----

     - See matplotlib documentation online for more on this subject
     - For `kind` = 'bar' or 'barh', the `position` keyword sets the relative
       alignment of the bar plot layout, from 0 (left/bottom-end) to
       1 (right/top-end). Default is 0.5 (center)
     """

     # Copied from pandas.plotting._core so that the adapted ``_plot`` defined in
     # this module is the one that gets called.

     import matplotlib.pyplot as plt

     # Reuse the current axes only when a figure already exists.
     if ax is None and plt.get_fignums():
         with plt.rc_context():
             ax = plt.gca()
         ax = PandasMPLPlot._get_ax_layer(ax)

     options = dict(
         kind=kind,
         ax=ax,
         figsize=figsize,
         use_index=use_index,
         title=title,
         grid=grid,
         legend=legend,
         style=style,
         logx=logx,
         logy=logy,
         loglog=loglog,
         xticks=xticks,
         yticks=yticks,
         xlim=xlim,
         ylim=ylim,
         rot=rot,
         fontsize=fontsize,
         colormap=colormap,
         table=table,
         yerr=yerr,
         xerr=xerr,
         label=label,
         secondary_y=secondary_y,
     )
     return _plot(data, **options, **kwds)


 def plot_frame(
     data,
     x=None,
     y=None,
     kind="line",
     ax=None,
     subplots=False,
     sharex=None,
     sharey=False,
     layout=None,
     figsize=None,
     use_index=True,
     title=None,
     grid=None,
     legend=True,
     style=None,
     logx=False,
     logy=False,
     loglog=False,
     xticks=None,
     yticks=None,
     xlim=None,
     ylim=None,
     rot=None,
     fontsize=None,
     colormap=None,
     table=False,
     yerr=None,
     xerr=None,
     secondary_y=False,
     **kwds,
 ):
     """
     Plot a DataFrame using matplotlib / pylab.

     Every plot kind is also exposed as a method of the ``DataFrame.plot``
     accessor, so ``psdf.plot(kind='line')`` and ``psdf.plot.line()`` are
     equivalent.

     Parameters
     ----------
     data : DataFrame

     kind : str
         - 'line' : line plot (default)
         - 'bar' : vertical bar plot
         - 'barh' : horizontal bar plot
         - 'hist' : histogram
         - 'box' : boxplot
         - 'kde' : Kernel Density Estimation plot
         - 'density' : same as 'kde'
         - 'area' : area plot
         - 'pie' : pie plot
         - 'scatter' : scatter plot
     ax : matplotlib axes object
         If not passed, uses gca()
     x : label or position, default None
     y : label, position or list of label, positions, default None
         Allows plotting of one column versus another.
     subplots : boolean, default False
         Forwarded unchanged to the plot class.
     layout : default None
         Forwarded unchanged to the plot class.
     figsize : a tuple (width, height) in inches
     use_index : boolean, default True
         Use index as ticks for x axis
     title : string or list
         Title of the plot. A string is printed at the top of the figure.
         A list, together with `subplots` set to True, puts each item above
         the corresponding subplot.
     grid : boolean, default None (matlab style default)
         Axis grid lines
     legend : False/True/'reverse'
         Place legend on axis subplots
     style : list or dict
         matplotlib line style per column
     logx : boolean, default False
         Use log scaling on x axis
     logy : boolean, default False
         Use log scaling on y axis
     loglog : boolean, default False
         Use log scaling on both x and y axes
     xticks : sequence
         Values to use for the xticks
     yticks : sequence
         Values to use for the yticks
     xlim : 2-tuple/list
     ylim : 2-tuple/list
     sharex: bool or None, default is None
         Whether to share x axis or not.
     sharey: bool, default is False
         Whether to share y axis or not.
     rot : int, default None
         Rotation for ticks (xticks for vertical, yticks for horizontal plots)
     fontsize : int, default None
         Font size for xticks and yticks
     colormap : str or matplotlib colormap object, default None
         Colormap to select colors from. A string is looked up as a
         matplotlib colormap name.
     colorbar : boolean, optional
         If True, plot colorbar (only relevant for 'scatter' and 'hexbin' plots)
     position : float
         Relative alignment for bar plot layout, from 0 (left/bottom-end)
         to 1 (right/top-end). Default is 0.5 (center)
     table : boolean, Series or DataFrame, default False
         If True, draw a table using the data in the DataFrame; the data is
         transposed to meet matplotlib's default layout.
         If a Series or DataFrame is passed, that data is used for the table.
     yerr : DataFrame, Series, array-like, dict and str
         See :ref:`Plotting with Error Bars <visualization.errorbars>` for
         detail.
     xerr : same types as yerr.
     label : label argument to provide to plot
     secondary_y : boolean or sequence of ints, default False
         If True then y-axis will be on the right
     mark_right : boolean, default True
         When using a secondary_y axis, automatically mark the column
         labels with "(right)" in the legend
     **kwds : keywords
         Options to pass to matplotlib plotting method

     Returns
     -------
     axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them

     Notes
     -----

     - See matplotlib documentation online for more on this subject
     - For `kind` = 'bar' or 'barh', the `position` keyword sets the relative
       alignment of the bar plot layout, from 0 (left/bottom-end) to
       1 (right/top-end). Default is 0.5 (center)
     """
     # Gather every explicit option and hand it, with the extra keywords, to the
     # shared ``_plot`` implementation.
     options = dict(
         kind=kind,
         x=x,
         y=y,
         ax=ax,
         figsize=figsize,
         use_index=use_index,
         title=title,
         grid=grid,
         legend=legend,
         subplots=subplots,
         style=style,
         logx=logx,
         logy=logy,
         loglog=loglog,
         xticks=xticks,
         yticks=yticks,
         xlim=xlim,
         ylim=ylim,
         rot=rot,
         fontsize=fontsize,
         colormap=colormap,
         table=table,
         yerr=yerr,
         xerr=xerr,
         sharex=sharex,
         sharey=sharey,
         secondary_y=secondary_y,
         layout=layout,
     )
     return _plot(data, **options, **kwds)


 def _plot(data, x=None, y=None, subplots=False, ax=None, kind="line", **kwds):
     """Build the plot object for ``kind``, render it and return its result.

     Parameters
     ----------
     data : pandas-on-Spark DataFrame or Series
         Data to plot.
     x, y : optional
         Passed positionally to the plot class for 'scatter' and 'hexbin'. For the
         other kinds, on a DataFrame, ``x`` becomes the index and ``y`` selects the
         column(s) to plot.
     subplots, ax, **kwds
         Forwarded to the plot class.
     kind : str, default 'line'
         Plot kind; lower-cased and stripped before use, and 'density' is treated
         as 'kde'.

     Returns
     -------
     The ``result`` attribute of the generated plot object.

     Raises
     ------
     ValueError
         If ``kind`` is not among ``_all_kinds``.
     NotImplementedError
         If ``kind`` is among ``_all_kinds`` but this module registers no plot class
         for it.
     """
     from pyspark.pandas import DataFrame

     # function copied from pandas.plotting._core
     # and adapted to handle pandas-on-Spark DataFrame and Series

     kind = kind.lower().strip()
     kind = {"density": "kde"}.get(kind, kind)
     if kind not in _all_kinds:
         raise ValueError("%r is not a valid plot kind" % kind)

     klass = _plot_klass.get(kind)
     if klass is None:
         # A kind can be in ``_all_kinds`` and still have no class in ``_plot_klass``
         # (none is registered for "hexbin"); fail with a clear error rather than
         # a bare KeyError.
         raise NotImplementedError("%r plot kind is not implemented in pandas-on-Spark" % kind)

     # scatter and hexbin are inherited from PlanePlot which require x and y
     if kind in ("scatter", "hexbin"):
         plot_obj = klass(data, x, y, subplots=subplots, ax=ax, kind=kind, **kwds)
     else:
         # check data type and do preprocess before applying plot
         if isinstance(data, DataFrame):
             if x is not None:
                 data = data.set_index(x)
             # TODO: check if value of y is plottable
             if y is not None:
                 data = data[y]

         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
     plot_obj.generate()
     plot_obj.draw()
     return plot_obj.result