| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| """ |
A wrapper class for Spark DataFrame to behave similarly to pandas DataFrame.
| """ |
| from collections import OrderedDict, defaultdict, namedtuple |
| from collections.abc import Mapping |
| from distutils.version import LooseVersion |
| import re |
| import warnings |
| import inspect |
| import json |
| import types |
| from functools import partial, reduce |
| import sys |
| from itertools import zip_longest |
| from types import TracebackType |
| from typing import ( |
| Any, |
| Callable, |
| Dict, |
| Generic, |
| IO, |
| Iterable, |
| Iterator, |
| List, |
| Optional, |
| Sequence, |
| Tuple, |
| Type, |
| Union, |
| cast, |
| no_type_check, |
| TYPE_CHECKING, |
| ) |
| import datetime |
| |
| import numpy as np |
| import pandas as pd |
| from pandas.api.types import is_list_like, is_dict_like, is_scalar |
| from pandas.api.extensions import ExtensionDtype |
| from pandas.tseries.frequencies import DateOffset, to_offset |
| |
| if TYPE_CHECKING: |
| from pandas.io.formats.style import Styler # noqa: F401 (SPARK-34943) |
| |
| if LooseVersion(pd.__version__) >= LooseVersion("0.24"): |
| from pandas.core.dtypes.common import infer_dtype_from_object |
| else: |
| from pandas.core.dtypes.common import _get_dtype_from_object as infer_dtype_from_object |
| from pandas.core.accessor import CachedAccessor |
| from pandas.core.dtypes.inference import is_sequence |
| from pyspark import StorageLevel |
| from pyspark.sql import Column, DataFrame as SparkDataFrame, functions as F |
| from pyspark.sql.functions import pandas_udf |
| from pyspark.sql.types import ( # noqa: F401 (SPARK-34943) |
| ArrayType, |
| BooleanType, |
| DataType, |
| DoubleType, |
| FloatType, |
| NumericType, |
| Row, |
| StringType, |
| StructField, |
| StructType, |
| ) |
| from pyspark.sql.window import Window |
| |
| from pyspark import pandas as ps # For running doctests and reference resolution in PyCharm. |
| from pyspark.pandas._typing import Axis, DataFrameOrSeries, Dtype, Label, Name, Scalar, T |
| from pyspark.pandas.accessors import PandasOnSparkFrameMethods |
| from pyspark.pandas.config import option_context, get_option |
| from pyspark.pandas.spark import functions as SF |
| from pyspark.pandas.spark.accessors import SparkFrameMethods, CachedSparkFrameMethods |
| from pyspark.pandas.utils import ( |
| align_diff_frames, |
| column_labels_level, |
| combine_frames, |
| default_session, |
| is_name_like_tuple, |
| is_name_like_value, |
| is_testing, |
| name_like_string, |
| same_anchor, |
| scol_for, |
| validate_arguments_and_invoke_function, |
| validate_axis, |
| validate_bool_kwarg, |
| validate_how, |
| validate_mode, |
| verify_temp_column_name, |
| ) |
| from pyspark.pandas.generic import Frame |
| from pyspark.pandas.internal import ( |
| InternalField, |
| InternalFrame, |
| HIDDEN_COLUMNS, |
| NATURAL_ORDER_COLUMN_NAME, |
| SPARK_INDEX_NAME_FORMAT, |
| SPARK_DEFAULT_INDEX_NAME, |
| SPARK_DEFAULT_SERIES_NAME, |
| ) |
| from pyspark.pandas.missing.frame import _MissingPandasLikeDataFrame |
| from pyspark.pandas.ml import corr |
| from pyspark.pandas.typedef.typehints import ( |
| as_spark_type, |
| infer_return_type, |
| pandas_on_spark_type, |
| spark_type_to_pandas_dtype, |
| DataFrameType, |
| SeriesType, |
| ScalarType, |
| ) |
| from pyspark.pandas.plot import PandasOnSparkPlotAccessor |
| |
| if TYPE_CHECKING: |
| from pyspark.sql._typing import OptionalPrimitiveType # noqa: F401 (SPARK-34943) |
| |
| from pyspark.pandas.groupby import DataFrameGroupBy # noqa: F401 (SPARK-34943) |
| from pyspark.pandas.indexes import Index # noqa: F401 (SPARK-34943) |
| from pyspark.pandas.series import Series # noqa: F401 (SPARK-34943) |
| |
| |
# These regular expression patterns are compiled and defined here to avoid compiling the same
# pattern every time it is used in __repr__ and _repr_html_ in DataFrame.
# Both patterns seek the footer string of pandas' string and HTML representations.
| REPR_PATTERN = re.compile(r"\n\n\[(?P<rows>[0-9]+) rows x (?P<columns>[0-9]+) columns\]$") |
| REPR_HTML_PATTERN = re.compile( |
| r"\n\<p\>(?P<rows>[0-9]+) rows × (?P<columns>[0-9]+) columns\<\/p\>\n\<\/div\>$" |
| ) |
| |
| |
| _flex_doc_FRAME = """ |
| Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). |
| |
| Equivalent to ``{equiv}``. With reverse version, `{reverse}`. |
| |
This is one of the flexible wrappers (`add`, `sub`, `mul`, `div`) to
the arithmetic operators: `+`, `-`, `*`, `/`, `//`.
| |
| Parameters |
| ---------- |
| other : scalar |
    Any single value.
| |
| Returns |
| ------- |
| DataFrame |
| Result of the arithmetic operation. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({{'angles': [0, 3, 4], |
| ... 'degrees': [360, 180, 360]}}, |
| ... index=['circle', 'triangle', 'rectangle'], |
| ... columns=['angles', 'degrees']) |
| >>> df |
| angles degrees |
| circle 0 360 |
| triangle 3 180 |
| rectangle 4 360 |
| |
Add a scalar with the operator version, which returns the same
results, and also with the reverse version.
| |
| >>> df + 1 |
| angles degrees |
| circle 1 361 |
| triangle 4 181 |
| rectangle 5 361 |
| |
| >>> df.add(1) |
| angles degrees |
| circle 1 361 |
| triangle 4 181 |
| rectangle 5 361 |
| |
| >>> df.add(df) |
| angles degrees |
| circle 0 720 |
| triangle 6 360 |
| rectangle 8 720 |
| |
| >>> df + df + df |
| angles degrees |
| circle 0 1080 |
| triangle 9 540 |
| rectangle 12 1080 |
| |
| >>> df.radd(1) |
| angles degrees |
| circle 1 361 |
| triangle 4 181 |
| rectangle 5 361 |
| |
| Divide and true divide by constant with reverse version. |
| |
| >>> df / 10 |
| angles degrees |
| circle 0.0 36.0 |
| triangle 0.3 18.0 |
| rectangle 0.4 36.0 |
| |
| >>> df.div(10) |
| angles degrees |
| circle 0.0 36.0 |
| triangle 0.3 18.0 |
| rectangle 0.4 36.0 |
| |
| >>> df.rdiv(10) |
| angles degrees |
| circle inf 0.027778 |
| triangle 3.333333 0.055556 |
| rectangle 2.500000 0.027778 |
| |
| >>> df.truediv(10) |
| angles degrees |
| circle 0.0 36.0 |
| triangle 0.3 18.0 |
| rectangle 0.4 36.0 |
| |
| >>> df.rtruediv(10) |
| angles degrees |
| circle inf 0.027778 |
| triangle 3.333333 0.055556 |
| rectangle 2.500000 0.027778 |
| |
| Subtract by constant with reverse version. |
| |
| >>> df - 1 |
| angles degrees |
| circle -1 359 |
| triangle 2 179 |
| rectangle 3 359 |
| |
| >>> df.sub(1) |
| angles degrees |
| circle -1 359 |
| triangle 2 179 |
| rectangle 3 359 |
| |
| >>> df.rsub(1) |
| angles degrees |
| circle 1 -359 |
| triangle -2 -179 |
| rectangle -3 -359 |
| |
| Multiply by constant with reverse version. |
| |
| >>> df * 1 |
| angles degrees |
| circle 0 360 |
| triangle 3 180 |
| rectangle 4 360 |
| |
| >>> df.mul(1) |
| angles degrees |
| circle 0 360 |
| triangle 3 180 |
| rectangle 4 360 |
| |
| >>> df.rmul(1) |
| angles degrees |
| circle 0 360 |
| triangle 3 180 |
| rectangle 4 360 |
| |
Floor divide by constant with reverse version.
| |
| >>> df // 10 |
| angles degrees |
| circle 0.0 36.0 |
| triangle 0.0 18.0 |
| rectangle 0.0 36.0 |
| |
| >>> df.floordiv(10) |
| angles degrees |
| circle 0.0 36.0 |
| triangle 0.0 18.0 |
| rectangle 0.0 36.0 |
| |
| >>> df.rfloordiv(10) # doctest: +SKIP |
| angles degrees |
| circle inf 0.0 |
| triangle 3.0 0.0 |
| rectangle 2.0 0.0 |
| |
| Mod by constant with reverse version. |
| |
| >>> df % 2 |
| angles degrees |
| circle 0 0 |
| triangle 1 0 |
| rectangle 0 0 |
| |
| >>> df.mod(2) |
| angles degrees |
| circle 0 0 |
| triangle 1 0 |
| rectangle 0 0 |
| |
| >>> df.rmod(2) |
| angles degrees |
| circle NaN 2 |
| triangle 2.0 2 |
| rectangle 2.0 2 |
| |
| Power by constant with reverse version. |
| |
| >>> df ** 2 |
| angles degrees |
| circle 0.0 129600.0 |
| triangle 9.0 32400.0 |
| rectangle 16.0 129600.0 |
| |
| >>> df.pow(2) |
| angles degrees |
| circle 0.0 129600.0 |
| triangle 9.0 32400.0 |
| rectangle 16.0 129600.0 |
| |
| >>> df.rpow(2) |
| angles degrees |
| circle 1.0 2.348543e+108 |
| triangle 8.0 1.532496e+54 |
| rectangle 16.0 2.348543e+108 |
| """ |
| |
| |
| def _create_tuple_for_frame_type(params: Any) -> object: |
| """ |
| This is a workaround to support variadic generic in DataFrame. |
| |
    See https://github.com/python/typing/issues/193;
    we always wrap the given type hints in a tuple to mimic a variadic generic.
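
    For example, a minimal sketch of the produced type (illustrative only, so it is
    skipped as a doctest):

    >>> _create_tuple_for_frame_type(slice("id", int))  # doctest: +SKIP
    typing.Tuple[NameType]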
| """ |
| from pyspark.pandas.typedef import NameTypeHolder |
| |
| if isinstance(params, zip): # type: ignore |
| params = [slice(name, tpe) for name, tpe in params] # type: ignore |
| |
| if isinstance(params, slice): |
| params = (params,) |
| |
| if ( |
| hasattr(params, "__len__") |
| and isinstance(params, Iterable) |
| and all(isinstance(param, slice) for param in params) |
| ): |
| for param in params: |
| if isinstance(param.start, str) and param.step is not None: |
| raise TypeError( |
| "Type hints should be specified as " |
| "DataFrame['name': type]; however, got %s" % param |
| ) |
| |
| name_classes = [] |
| for param in params: |
| new_class = type("NameType", (NameTypeHolder,), {}) # type: Type[NameTypeHolder] |
| new_class.name = param.start |
| # When the given argument is a numpy's dtype instance. |
| new_class.tpe = param.stop.type if isinstance(param.stop, np.dtype) else param.stop |
| name_classes.append(new_class) |
| |
| return Tuple[tuple(name_classes)] |
| |
| if not isinstance(params, Iterable): |
| params = [params] |
| |
| new_params = [] |
| for param in params: |
| if isinstance(param, ExtensionDtype): |
| new_class = type("NameType", (NameTypeHolder,), {}) |
| new_class.tpe = param |
| new_params.append(new_class) |
| else: |
| new_params.append(param.type if isinstance(param, np.dtype) else param) |
| return Tuple[tuple(new_params)] |
| |
| |
| if (3, 5) <= sys.version_info < (3, 7) and __name__ != "__main__": |
| from typing import GenericMeta # type: ignore |
| |
    # This is a workaround to support variadic generic in DataFrame on Python 3.5 and 3.6.
    # See https://github.com/python/typing/issues/193
    # We wrap the input params in a tuple to mimic a variadic generic.
| old_getitem = GenericMeta.__getitem__ # type: ignore |
| |
| @no_type_check |
| def new_getitem(self, params): |
| if hasattr(self, "is_dataframe"): |
| return old_getitem(self, _create_tuple_for_frame_type(params)) |
| else: |
| return old_getitem(self, params) |
| |
| GenericMeta.__getitem__ = new_getitem # type: ignore |
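
# With the patch above (on newer Python versions this is typically handled by
# ``__class_getitem__`` instead), type hints like the following can be written.
# A hypothetical illustration, where ``transform`` is not a real API:
#
#     def transform(pdf) -> ps.DataFrame["id": int, "value": float]:
#         return pdf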
| |
| |
| class DataFrame(Frame, Generic[T]): |
| """ |
    pandas-on-Spark DataFrame that corresponds to pandas DataFrame logically. This holds a
    Spark DataFrame internally.
| |
| :ivar _internal: an internal immutable Frame to manage metadata. |
| :type _internal: InternalFrame |
| |
| Parameters |
| ---------- |
| data : numpy ndarray (structured or homogeneous), dict, pandas DataFrame, Spark DataFrame \ |
| or pandas-on-Spark Series |
| Dict can contain Series, arrays, constants, or list-like objects |
| If data is a dict, argument order is maintained for Python 3.6 |
| and later. |
        Note that if `data` is a pandas DataFrame, a Spark DataFrame, or a pandas-on-Spark
        Series, other arguments should not be used.
| index : Index or array-like |
        Index to use for the resulting frame. Will default to RangeIndex if
        no indexing information is part of the input data and no index is provided
| columns : Index or array-like |
| Column labels to use for resulting frame. Will default to |
| RangeIndex (0, 1, 2, ..., n) if no column labels are provided |
| dtype : dtype, default None |
| Data type to force. Only a single dtype is allowed. If None, infer |
| copy : boolean, default False |
| Copy data from inputs. Only affects DataFrame / 2d ndarray input |
| |
| Examples |
| -------- |
| Constructing DataFrame from a dictionary. |
| |
| >>> d = {'col1': [1, 2], 'col2': [3, 4]} |
| >>> df = ps.DataFrame(data=d, columns=['col1', 'col2']) |
| >>> df |
| col1 col2 |
| 0 1 3 |
| 1 2 4 |
| |
| Constructing DataFrame from pandas DataFrame |
| |
| >>> df = ps.DataFrame(pd.DataFrame(data=d, columns=['col1', 'col2'])) |
| >>> df |
| col1 col2 |
| 0 1 3 |
| 1 2 4 |
| |
| Notice that the inferred dtype is int64. |
| |
| >>> df.dtypes |
| col1 int64 |
| col2 int64 |
| dtype: object |
| |
| To enforce a single dtype: |
| |
| >>> df = ps.DataFrame(data=d, dtype=np.int8) |
| >>> df.dtypes |
| col1 int8 |
| col2 int8 |
| dtype: object |
| |
| Constructing DataFrame from numpy ndarray: |
| |
| >>> df2 = ps.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)), |
| ... columns=['a', 'b', 'c', 'd', 'e']) |
| >>> df2 # doctest: +SKIP |
| a b c d e |
| 0 3 1 4 9 8 |
| 1 4 8 4 8 4 |
| 2 7 6 5 6 7 |
| 3 8 7 9 1 0 |
| 4 2 5 4 3 9 |
| """ |
| |
| @no_type_check |
| def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): |
| if isinstance(data, InternalFrame): |
| assert index is None |
| assert columns is None |
| assert dtype is None |
| assert not copy |
| internal = data |
| elif isinstance(data, SparkDataFrame): |
| assert index is None |
| assert columns is None |
| assert dtype is None |
| assert not copy |
| internal = InternalFrame(spark_frame=data, index_spark_columns=None) |
| elif isinstance(data, ps.Series): |
| assert index is None |
| assert columns is None |
| assert dtype is None |
| assert not copy |
| data = data.to_frame() |
| internal = data._internal |
| else: |
| if isinstance(data, pd.DataFrame): |
| assert index is None |
| assert columns is None |
| assert dtype is None |
| assert not copy |
| pdf = data |
| else: |
| pdf = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy) |
| internal = InternalFrame.from_pandas(pdf) |
| |
| object.__setattr__(self, "_internal_frame", internal) |
| |
| @property |
| def _pssers(self) -> Dict[Label, "Series"]: |
| """Return a dict of column label -> Series which anchors `self`.""" |
| from pyspark.pandas.series import Series |
| |
| if not hasattr(self, "_psseries"): |
| object.__setattr__( |
| self, |
| "_psseries", |
| {label: Series(data=self, index=label) for label in self._internal.column_labels}, |
| ) |
| else: |
| psseries = self._psseries # type: ignore |
| assert len(self._internal.column_labels) == len(psseries), ( |
| len(self._internal.column_labels), |
| len(psseries), |
| ) |
| if any(self is not psser._psdf for psser in psseries.values()): |
| # Refresh the dict to contain only Series anchoring `self`. |
| self._psseries = { |
| label: psseries[label] |
| if self is psseries[label]._psdf |
| else Series(data=self, index=label) |
| for label in self._internal.column_labels |
| } |
| return self._psseries |
| |
| @property |
| def _internal(self) -> InternalFrame: |
| return self._internal_frame # type: ignore |
| |
| def _update_internal_frame( |
| self, internal: InternalFrame, requires_same_anchor: bool = True |
| ) -> None: |
| """ |
| Update InternalFrame with the given one. |
| |
        If the column labels have changed or the new InternalFrame does not have the same
        `anchor`, disconnect the links to the existing Series and create new ones.

        If `requires_same_anchor` is `False`, the anchor check is skipped and the
        InternalFrame is updated unconditionally, e.g., when replacing the internal frame
        with its resolved_copy, or when updating the underlying Spark DataFrame in order to
        combine it with a different Spark DataFrame.
| |
| :param internal: the new InternalFrame |
        :param requires_same_anchor: whether to check that the anchor is the same
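
        For example, a minimal sketch (illustrative only, so it is skipped as a doctest):

        >>> psdf = ps.DataFrame({"a": [1, 2]})  # doctest: +SKIP
        >>> psdf._update_internal_frame(  # doctest: +SKIP
        ...     psdf._internal.resolved_copy, requires_same_anchor=False)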
| """ |
| from pyspark.pandas.series import Series |
| |
| if hasattr(self, "_psseries"): |
| psseries = {} |
| |
| for old_label, new_label in zip_longest( |
| self._internal.column_labels, internal.column_labels |
| ): |
| if old_label is not None: |
| psser = self._pssers[old_label] |
| |
| renamed = old_label != new_label |
| not_same_anchor = requires_same_anchor and not same_anchor(internal, psser) |
| |
| if renamed or not_same_anchor: |
| psdf = DataFrame(self._internal.select_column(old_label)) # type: DataFrame |
| psser._update_anchor(psdf) |
| psser = None |
| else: |
| psser = None |
| if new_label is not None: |
| if psser is None: |
| psser = Series(data=self, index=new_label) |
| psseries[new_label] = psser |
| |
| self._psseries = psseries |
| |
| self._internal_frame = internal |
| |
| if hasattr(self, "_repr_pandas_cache"): |
| del self._repr_pandas_cache |
| |
| @property |
| def ndim(self) -> int: |
| """ |
| Return an int representing the number of array dimensions. |
| |
        Return 2 for DataFrame.
| |
| Examples |
| -------- |
| |
| >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]], |
| ... index=['cobra', 'viper', None], |
| ... columns=['max_speed', 'shield']) |
| >>> df |
| max_speed shield |
| cobra 1 2 |
| viper 4 5 |
| NaN 7 8 |
| >>> df.ndim |
| 2 |
| """ |
| return 2 |
| |
| @property |
| def axes(self) -> List: |
| """ |
| Return a list representing the axes of the DataFrame. |
| |
| It has the row axis labels and column axis labels as the only members. |
| They are returned in that order. |
| |
| Examples |
| -------- |
| |
| >>> df = ps.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) |
| >>> df.axes |
| [Int64Index([0, 1], dtype='int64'), Index(['col1', 'col2'], dtype='object')] |
| """ |
| return [self.index, self.columns] |
| |
| def _reduce_for_stat_function( |
| self, |
| sfun: Union[Callable[[Column], Column], Callable[[Column, DataType], Column]], |
| name: str, |
| axis: Optional[Axis] = None, |
| numeric_only: bool = True, |
| **kwargs: Any |
| ) -> "Series": |
| """ |
        Applies sfun to each column and returns a Series where the number of rows equals
        the number of columns.
| |
| Parameters |
| ---------- |
        sfun : either a 1-arg function that takes a Column and returns a Column, or
            a 2-arg function that takes a Column and its DataType and returns a Column.
        name : original pandas API name.
        axis : axis to apply. 0 or 1, or 'index' or 'columns'.
| numeric_only : bool, default True |
| Include only float, int, boolean columns. False is not supported. This parameter |
| is mainly for pandas compatibility. Only 'DataFrame.count' uses this parameter |
| currently. |
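
        For example, a minimal sketch (illustrative only, so it is skipped as a doctest):

        >>> psdf = ps.DataFrame({"a": [1.0, 2.0, 3.0]})  # doctest: +SKIP
        >>> psdf._reduce_for_stat_function(F.sum, name="sum")  # doctest: +SKIP
        a    6.0
        dtype: float64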
| """ |
| from inspect import signature |
| from pyspark.pandas.series import Series, first_series |
| |
| axis = validate_axis(axis) |
| if axis == 0: |
| min_count = kwargs.get("min_count", 0) |
| |
| exprs = [SF.lit(None).cast(StringType()).alias(SPARK_DEFAULT_INDEX_NAME)] |
| new_column_labels = [] |
| num_args = len(signature(sfun).parameters) |
| for label in self._internal.column_labels: |
| spark_column = self._internal.spark_column_for(label) |
| spark_type = self._internal.spark_type_for(label) |
| |
| is_numeric_or_boolean = isinstance(spark_type, (NumericType, BooleanType)) |
| keep_column = not numeric_only or is_numeric_or_boolean |
| |
| if keep_column: |
| if num_args == 1: |
| # Only pass in the column if sfun accepts only one arg |
| scol = cast(Callable[[Column], Column], sfun)(spark_column) |
| else: # must be 2 |
| assert num_args == 2 |
| # Pass in both the column and its data type if sfun accepts two args |
| scol = cast(Callable[[Column, DataType], Column], sfun)( |
| spark_column, spark_type |
| ) |
| |
| if min_count > 0: |
| scol = F.when( |
| Frame._count_expr(spark_column, spark_type) >= min_count, scol |
| ) |
| |
| exprs.append(scol.alias(name_like_string(label))) |
| new_column_labels.append(label) |
| |
| if len(exprs) == 1: |
| return Series([]) |
| |
| sdf = self._internal.spark_frame.select(*exprs) |
| |
| # The data is expected to be small so it's fine to transpose/use default index. |
| with ps.option_context("compute.max_rows", 1): |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)], |
| column_labels=new_column_labels, |
| column_label_names=self._internal.column_label_names, |
| ) |
| return first_series(DataFrame(internal).transpose()) |
| |
| else: |
            # Here we execute the function against the first `limit` rows to infer the
            # return type. If the number of records is within the limit, the pandas API
            # is used directly as a shortcut.
| limit = get_option("compute.shortcut_limit") |
| pdf = self.head(limit + 1)._to_internal_pandas() |
| pser = getattr(pdf, name)(axis=axis, numeric_only=numeric_only, **kwargs) |
| if len(pdf) <= limit: |
| return Series(pser) |
| |
| @pandas_udf(returnType=as_spark_type(pser.dtype.type)) # type: ignore |
| def calculate_columns_axis(*cols: pd.Series) -> pd.Series: |
| return getattr(pd.concat(cols, axis=1), name)( |
| axis=axis, numeric_only=numeric_only, **kwargs |
| ) |
| |
| column_name = verify_temp_column_name( |
| self._internal.spark_frame.select(self._internal.index_spark_columns), |
| "__calculate_columns_axis__", |
| ) |
| sdf = self._internal.spark_frame.select( |
| self._internal.index_spark_columns |
| + [calculate_columns_axis(*self._internal.data_spark_columns).alias(column_name)] |
| ) |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[ |
| scol_for(sdf, col) for col in self._internal.index_spark_column_names |
| ], |
| index_names=self._internal.index_names, |
| index_fields=self._internal.index_fields, |
| ) |
| return first_series(DataFrame(internal)).rename(pser.name) |
| |
| def _psser_for(self, label: Label) -> "Series": |
| """ |
| Create Series with a proper column label. |
| |
| The given label must be verified to exist in `InternalFrame.column_labels`. |
| |
| For example, in some method, self is like: |
| |
| >>> self = ps.range(3) |
| |
| `self._psser_for(label)` can be used with `InternalFrame.column_labels`: |
| |
| >>> self._psser_for(self._internal.column_labels[0]) |
| 0 0 |
| 1 1 |
| 2 2 |
| Name: id, dtype: int64 |
| |
        `self._psser_for(label)` must not be used directly with user inputs.
        In that case, `self[label]` should be used instead, which checks whether the label
        exists:
| |
| >>> self['id'] |
| 0 0 |
| 1 1 |
| 2 2 |
| Name: id, dtype: int64 |
| """ |
| return self._pssers[label] |
| |
| def _apply_series_op( |
| self, op: Callable[["Series"], Union["Series", Column]], should_resolve: bool = False |
| ) -> "DataFrame": |
| applied = [] |
| for label in self._internal.column_labels: |
| applied.append(op(self._psser_for(label))) |
| internal = self._internal.with_new_columns(applied) |
| if should_resolve: |
| internal = internal.resolved_copy |
| return DataFrame(internal) |
| |
| # Arithmetic Operators |
| def _map_series_op(self, op: str, other: Any) -> "DataFrame": |
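        """Apply the named Series operator `op` against `other`, column by column.

        `other` may be a scalar or another DataFrame; two DataFrames that are not the
        same anchor are aligned with a full outer join before `op` is applied.
        """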
| from pyspark.pandas.base import IndexOpsMixin |
| |
| if not isinstance(other, DataFrame) and ( |
| isinstance(other, IndexOpsMixin) or is_sequence(other) |
| ): |
| raise TypeError( |
| "%s with a sequence is currently not supported; " |
| "however, got %s." % (op, type(other).__name__) |
| ) |
| |
| if isinstance(other, DataFrame): |
| if self._internal.column_labels_level != other._internal.column_labels_level: |
| raise ValueError("cannot join with no overlapping index names") |
| |
| if not same_anchor(self, other): |
| # Different DataFrames |
| def apply_op( |
| psdf: DataFrame, |
| this_column_labels: List[Label], |
| that_column_labels: List[Label], |
| ) -> Iterator[Tuple["Series", Label]]: |
| for this_label, that_label in zip(this_column_labels, that_column_labels): |
| yield ( |
| getattr(psdf._psser_for(this_label), op)( |
| psdf._psser_for(that_label) |
| ).rename(this_label), |
| this_label, |
| ) |
| |
| return align_diff_frames(apply_op, self, other, fillna=True, how="full") |
| else: |
| applied = [] |
| column_labels = [] |
| for label in self._internal.column_labels: |
| if label in other._internal.column_labels: |
| applied.append(getattr(self._psser_for(label), op)(other._psser_for(label))) |
| else: |
| applied.append( |
| SF.lit(None) |
| .cast(self._internal.spark_type_for(label)) |
| .alias(name_like_string(label)) |
| ) |
| column_labels.append(label) |
| for label in other._internal.column_labels: |
| if label not in column_labels: |
| applied.append( |
| SF.lit(None) |
| .cast(other._internal.spark_type_for(label)) |
| .alias(name_like_string(label)) |
| ) |
| column_labels.append(label) |
| internal = self._internal.with_new_columns(applied, column_labels=column_labels) |
| return DataFrame(internal) |
| else: |
| return self._apply_series_op(lambda psser: getattr(psser, op)(other)) |
| |
| def __add__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("add", other) |
| |
| def __radd__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("radd", other) |
| |
| def __div__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("div", other) |
| |
| def __rdiv__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("rdiv", other) |
| |
| def __truediv__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("truediv", other) |
| |
| def __rtruediv__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("rtruediv", other) |
| |
| def __mul__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("mul", other) |
| |
| def __rmul__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("rmul", other) |
| |
| def __sub__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("sub", other) |
| |
| def __rsub__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("rsub", other) |
| |
| def __pow__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("pow", other) |
| |
| def __rpow__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("rpow", other) |
| |
| def __mod__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("mod", other) |
| |
| def __rmod__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("rmod", other) |
| |
| def __floordiv__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("floordiv", other) |
| |
| def __rfloordiv__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("rfloordiv", other) |
| |
| def __abs__(self) -> "DataFrame": |
| return self._apply_series_op(lambda psser: abs(psser)) |
| |
| def __neg__(self) -> "DataFrame": |
| return self._apply_series_op(lambda psser: -psser) |
| |
| def add(self, other: Any) -> "DataFrame": |
| return self + other |
| |
| # create accessor for plot |
| plot = CachedAccessor("plot", PandasOnSparkPlotAccessor) |
| |
| # create accessor for Spark related methods. |
| spark = CachedAccessor("spark", SparkFrameMethods) |
| |
| # create accessor for pandas-on-Spark specific methods. |
| pandas_on_spark = CachedAccessor("pandas_on_spark", PandasOnSparkFrameMethods) |
| |
| # keep the name "koalas" for backward compatibility. |
| koalas = CachedAccessor("koalas", PandasOnSparkFrameMethods) |
| |
| @no_type_check |
| def hist(self, bins=10, **kwds): |
| return self.plot.hist(bins, **kwds) |
| |
| hist.__doc__ = PandasOnSparkPlotAccessor.hist.__doc__ |
| |
| @no_type_check |
| def kde(self, bw_method=None, ind=None, **kwds): |
| return self.plot.kde(bw_method, ind, **kwds) |
| |
| kde.__doc__ = PandasOnSparkPlotAccessor.kde.__doc__ |
| |
| add.__doc__ = _flex_doc_FRAME.format( |
| desc="Addition", op_name="+", equiv="dataframe + other", reverse="radd" |
| ) |
| |
| def radd(self, other: Any) -> "DataFrame": |
| return other + self |
| |
| radd.__doc__ = _flex_doc_FRAME.format( |
| desc="Addition", op_name="+", equiv="other + dataframe", reverse="add" |
| ) |
| |
| def div(self, other: Any) -> "DataFrame": |
| return self / other |
| |
| div.__doc__ = _flex_doc_FRAME.format( |
| desc="Floating division", op_name="/", equiv="dataframe / other", reverse="rdiv" |
| ) |
| |
| divide = div |
| |
| def rdiv(self, other: Any) -> "DataFrame": |
| return other / self |
| |
| rdiv.__doc__ = _flex_doc_FRAME.format( |
| desc="Floating division", op_name="/", equiv="other / dataframe", reverse="div" |
| ) |
| |
| def truediv(self, other: Any) -> "DataFrame": |
| return self / other |
| |
| truediv.__doc__ = _flex_doc_FRAME.format( |
| desc="Floating division", op_name="/", equiv="dataframe / other", reverse="rtruediv" |
| ) |
| |
| def rtruediv(self, other: Any) -> "DataFrame": |
| return other / self |
| |
| rtruediv.__doc__ = _flex_doc_FRAME.format( |
| desc="Floating division", op_name="/", equiv="other / dataframe", reverse="truediv" |
| ) |
| |
| def mul(self, other: Any) -> "DataFrame": |
| return self * other |
| |
| mul.__doc__ = _flex_doc_FRAME.format( |
| desc="Multiplication", op_name="*", equiv="dataframe * other", reverse="rmul" |
| ) |
| |
| multiply = mul |
| |
| def rmul(self, other: Any) -> "DataFrame": |
| return other * self |
| |
| rmul.__doc__ = _flex_doc_FRAME.format( |
| desc="Multiplication", op_name="*", equiv="other * dataframe", reverse="mul" |
| ) |
| |
| def sub(self, other: Any) -> "DataFrame": |
| return self - other |
| |
| sub.__doc__ = _flex_doc_FRAME.format( |
| desc="Subtraction", op_name="-", equiv="dataframe - other", reverse="rsub" |
| ) |
| |
| subtract = sub |
| |
| def rsub(self, other: Any) -> "DataFrame": |
| return other - self |
| |
| rsub.__doc__ = _flex_doc_FRAME.format( |
| desc="Subtraction", op_name="-", equiv="other - dataframe", reverse="sub" |
| ) |
| |
| def mod(self, other: Any) -> "DataFrame": |
| return self % other |
| |
| mod.__doc__ = _flex_doc_FRAME.format( |
| desc="Modulo", op_name="%", equiv="dataframe % other", reverse="rmod" |
| ) |
| |
| def rmod(self, other: Any) -> "DataFrame": |
| return other % self |
| |
| rmod.__doc__ = _flex_doc_FRAME.format( |
| desc="Modulo", op_name="%", equiv="other % dataframe", reverse="mod" |
| ) |
| |
| def pow(self, other: Any) -> "DataFrame": |
| return self ** other |
| |
| pow.__doc__ = _flex_doc_FRAME.format( |
| desc="Exponential power of series", op_name="**", equiv="dataframe ** other", reverse="rpow" |
| ) |
| |
| def rpow(self, other: Any) -> "DataFrame": |
| return other ** self |
| |
| rpow.__doc__ = _flex_doc_FRAME.format( |
| desc="Exponential power", op_name="**", equiv="other ** dataframe", reverse="pow" |
| ) |
| |
| def floordiv(self, other: Any) -> "DataFrame": |
| return self // other |
| |
| floordiv.__doc__ = _flex_doc_FRAME.format( |
| desc="Integer division", op_name="//", equiv="dataframe // other", reverse="rfloordiv" |
| ) |
| |
| def rfloordiv(self, other: Any) -> "DataFrame": |
| return other // self |
| |
| rfloordiv.__doc__ = _flex_doc_FRAME.format( |
| desc="Integer division", op_name="//", equiv="other // dataframe", reverse="floordiv" |
| ) |
| |
| # Comparison Operators |
| def __eq__(self, other: Any) -> "DataFrame": # type: ignore[override] |
| return self._map_series_op("eq", other) |
| |
| def __ne__(self, other: Any) -> "DataFrame": # type: ignore[override] |
| return self._map_series_op("ne", other) |
| |
| def __lt__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("lt", other) |
| |
| def __le__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("le", other) |
| |
| def __ge__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("ge", other) |
| |
| def __gt__(self, other: Any) -> "DataFrame": |
| return self._map_series_op("gt", other) |
| |
| def eq(self, other: Any) -> "DataFrame": |
| """ |
| Compare if the current value is equal to the other. |
| |
| >>> df = ps.DataFrame({'a': [1, 2, 3, 4], |
| ... 'b': [1, np.nan, 1, np.nan]}, |
| ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) |
| |
| >>> df.eq(1) |
| a b |
| a True True |
| b False False |
| c False True |
| d False False |
| """ |
| return self == other |
| |
| equals = eq |
| |
| def gt(self, other: Any) -> "DataFrame": |
| """ |
| Compare if the current value is greater than the other. |
| |
| >>> df = ps.DataFrame({'a': [1, 2, 3, 4], |
| ... 'b': [1, np.nan, 1, np.nan]}, |
| ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) |
| |
| >>> df.gt(2) |
| a b |
| a False False |
| b False False |
| c True False |
| d True False |
| """ |
| return self > other |
| |
| def ge(self, other: Any) -> "DataFrame": |
| """ |
| Compare if the current value is greater than or equal to the other. |
| |
| >>> df = ps.DataFrame({'a': [1, 2, 3, 4], |
| ... 'b': [1, np.nan, 1, np.nan]}, |
| ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) |
| |
| >>> df.ge(1) |
| a b |
| a True True |
| b True False |
| c True True |
| d True False |
| """ |
| return self >= other |
| |
| def lt(self, other: Any) -> "DataFrame": |
| """ |
| Compare if the current value is less than the other. |
| |
| >>> df = ps.DataFrame({'a': [1, 2, 3, 4], |
| ... 'b': [1, np.nan, 1, np.nan]}, |
| ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) |
| |
| >>> df.lt(1) |
| a b |
| a False False |
| b False False |
| c False False |
| d False False |
| """ |
| return self < other |
| |
| def le(self, other: Any) -> "DataFrame": |
| """ |
| Compare if the current value is less than or equal to the other. |
| |
| >>> df = ps.DataFrame({'a': [1, 2, 3, 4], |
| ... 'b': [1, np.nan, 1, np.nan]}, |
| ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) |
| |
| >>> df.le(2) |
| a b |
| a True True |
| b True False |
| c False True |
| d False False |
| """ |
| return self <= other |
| |
| def ne(self, other: Any) -> "DataFrame": |
| """ |
| Compare if the current value is not equal to the other. |
| |
| >>> df = ps.DataFrame({'a': [1, 2, 3, 4], |
| ... 'b': [1, np.nan, 1, np.nan]}, |
| ... index=['a', 'b', 'c', 'd'], columns=['a', 'b']) |
| |
| >>> df.ne(1) |
| a b |
| a False False |
| b True True |
| c True False |
| d True True |
| """ |
| return self != other |
| |
| def applymap(self, func: Callable[[Any], Any]) -> "DataFrame": |
| """ |
        Apply a function to a DataFrame elementwise.
| |
| This method applies a function that accepts and returns a scalar |
| to every element of a DataFrame. |
| |
| .. note:: this API executes the function once to infer the type which is |
| potentially expensive, for instance, when the dataset is created after |
| aggregations or sorting. |
| |
| To avoid this, specify return type in ``func``, for instance, as below: |
| |
| >>> def square(x) -> np.int32: |
| ... return x ** 2 |
| |
        pandas-on-Spark uses the return type hint and does not try to infer the type.
| |
| Parameters |
| ---------- |
| func : callable |
| Python function, returns a single value from a single value. |
| |
| Returns |
| ------- |
| DataFrame |
| Transformed DataFrame. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([[1, 2.12], [3.356, 4.567]]) |
| >>> df |
| 0 1 |
| 0 1.000 2.120 |
| 1 3.356 4.567 |
| |
| >>> def str_len(x) -> int: |
| ... return len(str(x)) |
| >>> df.applymap(str_len) |
| 0 1 |
| 0 3 4 |
| 1 5 5 |
| |
| >>> def power(x) -> float: |
| ... return x ** 2 |
| >>> df.applymap(power) |
| 0 1 |
| 0 1.000000 4.494400 |
| 1 11.262736 20.857489 |
| |
| You can omit the type hint and let pandas-on-Spark infer its type. |
| |
| >>> df.applymap(lambda x: x ** 2) |
| 0 1 |
| 0 1.000000 4.494400 |
| 1 11.262736 20.857489 |
| """ |
| |
| # TODO: We can implement shortcut theoretically since it creates new DataFrame |
| # anyway and we don't have to worry about operations on different DataFrames. |
| return self._apply_series_op(lambda psser: psser.apply(func)) |
| |
| # TODO: not all arguments are implemented comparing to pandas' for now. |
| def aggregate(self, func: Union[List[str], Dict[Name, List[str]]]) -> "DataFrame": |
| """Aggregate using one or more operations over the specified axis. |
| |
| Parameters |
| ---------- |
| func : dict or a list |
| a dict mapping from column name (string) to |
| aggregate functions (list of strings). |
| If a list is given, the aggregation is performed against |
| all columns. |
| |
| Returns |
| ------- |
| DataFrame |
| |
| Notes |
| ----- |
| `agg` is an alias for `aggregate`. Use the alias. |
| |
| See Also |
| -------- |
| DataFrame.apply : Invoke function on DataFrame. |
| DataFrame.transform : Only perform transforming type operations. |
| DataFrame.groupby : Perform operations over groups. |
| Series.aggregate : The equivalent function for Series. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([[1, 2, 3], |
| ... [4, 5, 6], |
| ... [7, 8, 9], |
| ... [np.nan, np.nan, np.nan]], |
| ... columns=['A', 'B', 'C']) |
| |
| >>> df |
| A B C |
| 0 1.0 2.0 3.0 |
| 1 4.0 5.0 6.0 |
| 2 7.0 8.0 9.0 |
| 3 NaN NaN NaN |
| |
| Aggregate these functions over the rows. |
| |
| >>> df.agg(['sum', 'min'])[['A', 'B', 'C']].sort_index() |
| A B C |
| min 1.0 2.0 3.0 |
| sum 12.0 15.0 18.0 |
| |
| Different aggregations per column. |
| |
| >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})[['A', 'B']].sort_index() |
| A B |
| max NaN 8.0 |
| min 1.0 2.0 |
| sum 12.0 NaN |
| |
| For multi-index columns: |
| |
| >>> df.columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B"), ("Y", "C")]) |
| >>> df.agg(['sum', 'min'])[[("X", "A"), ("X", "B"), ("Y", "C")]].sort_index() |
| X Y |
| A B C |
| min 1.0 2.0 3.0 |
| sum 12.0 15.0 18.0 |
| |
| >>> aggregated = df.agg({("X", "A") : ['sum', 'min'], ("X", "B") : ['min', 'max']}) |
| >>> aggregated[[("X", "A"), ("X", "B")]].sort_index() # doctest: +NORMALIZE_WHITESPACE |
| X |
| A B |
| max NaN 8.0 |
| min 1.0 2.0 |
| sum 12.0 NaN |
| """ |
| from pyspark.pandas.groupby import GroupBy |
| |
        if isinstance(func, list):
            if all(isinstance(f, str) for f in func):
                func = {column: func for column in self.columns}
            else:
                raise ValueError(
                    "If the given function is a list, it "
                    "should only contain function names as strings."
                )
| |
| if not isinstance(func, dict) or not all( |
| is_name_like_value(key) |
| and ( |
| isinstance(value, str) |
| or (isinstance(value, list) and all(isinstance(v, str) for v in value)) |
| ) |
| for key, value in func.items() |
| ): |
| raise ValueError( |
| "aggs must be a dict mapping from column name to aggregate " |
| "functions (string or list of strings)." |
| ) |
| |
| with option_context("compute.default_index_type", "distributed"): |
| psdf = DataFrame(GroupBy._spark_groupby(self, func)) # type: DataFrame |
| |
        # The code below basically converts:
| # |
| # A B |
| # sum min min max |
| # 0 12.0 1.0 2.0 8.0 |
| # |
| # to: |
| # A B |
| # max NaN 8.0 |
| # min 1.0 2.0 |
| # sum 12.0 NaN |
| # |
        # The aggregated output is usually fairly small.
| |
| return psdf.stack().droplevel(0)[list(func.keys())] |
| |
| agg = aggregate |
| |
| def corr(self, method: str = "pearson") -> "DataFrame": |
| """ |
| Compute pairwise correlation of columns, excluding NA/null values. |
| |
| Parameters |
| ---------- |
| method : {'pearson', 'spearman'} |
| * pearson : standard correlation coefficient |
| * spearman : Spearman rank correlation |
| |
| Returns |
| ------- |
| y : DataFrame |
| |
| See Also |
| -------- |
| Series.corr |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], |
| ... columns=['dogs', 'cats']) |
| >>> df.corr('pearson') |
| dogs cats |
| dogs 1.000000 -0.851064 |
| cats -0.851064 1.000000 |
| |
| >>> df.corr('spearman') |
| dogs cats |
| dogs 1.000000 -0.948683 |
| cats -0.948683 1.000000 |
| |
| Notes |
| ----- |
| There are behavior differences between pandas-on-Spark and pandas. |
| |
| * the `method` argument only accepts 'pearson', 'spearman' |
| * the data should not contain NaNs. pandas-on-Spark will return an error. |
| * pandas-on-Spark doesn't support the following argument(s). |
| |
| * `min_periods` argument is not supported |
| """ |
| return cast(DataFrame, ps.from_pandas(corr(self, method))) |
| |
| def iteritems(self) -> Iterator[Tuple[Name, "Series"]]: |
| """ |
| Iterator over (column name, Series) pairs. |
| |
| Iterates over the DataFrame columns, returning a tuple with |
| the column name and the content as a Series. |
| |
| Returns |
| ------- |
| label : object |
| The column names for the DataFrame being iterated over. |
| content : Series |
| The column entries belonging to each label, as a Series. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'species': ['bear', 'bear', 'marsupial'], |
| ... 'population': [1864, 22000, 80000]}, |
| ... index=['panda', 'polar', 'koala'], |
| ... columns=['species', 'population']) |
| >>> df |
| species population |
| panda bear 1864 |
| polar bear 22000 |
| koala marsupial 80000 |
| |
| >>> for label, content in df.iteritems(): |
| ... print('label:', label) |
| ... print('content:', content.to_string()) |
| ... |
| label: species |
| content: panda bear |
| polar bear |
| koala marsupial |
| label: population |
| content: panda 1864 |
| polar 22000 |
| koala 80000 |
| """ |
| return ( |
| (label if len(label) > 1 else label[0], self._psser_for(label)) |
| for label in self._internal.column_labels |
| ) |
| |
| def iterrows(self) -> Iterator[Tuple[Name, pd.Series]]: |
| """ |
| Iterate over DataFrame rows as (index, Series) pairs. |
| |
| Yields |
| ------ |
| index : label or tuple of label |
| The index of the row. A tuple for a `MultiIndex`. |
| data : pandas.Series |
| The data of the row as a Series. |
| |
| Notes |
| ----- |
| |
| 1. Because ``iterrows`` returns a Series for each row, |
| it does **not** preserve dtypes across the rows (dtypes are |
| preserved across columns for DataFrames). For example, |
| |
| >>> df = ps.DataFrame([[1, 1.5]], columns=['int', 'float']) |
| >>> row = next(df.iterrows())[1] |
| >>> row |
| int 1.0 |
| float 1.5 |
| Name: 0, dtype: float64 |
| >>> print(row['int'].dtype) |
| float64 |
| >>> print(df['int'].dtype) |
| int64 |
| |
| To preserve dtypes while iterating over the rows, it is better |
| to use :meth:`itertuples` which returns namedtuples of the values |
| and which is generally faster than ``iterrows``. |
| |
| 2. You should **never modify** something you are iterating over. |
| This is not guaranteed to work in all cases. Depending on the |
| data types, the iterator returns a copy and not a view, and writing |
| to it will have no effect. |
| """ |
| |
| columns = self.columns |
| internal_index_columns = self._internal.index_spark_column_names |
| internal_data_columns = self._internal.data_spark_column_names |
| |
| def extract_kv_from_spark_row(row: Row) -> Tuple[Name, Any]: |
| k = ( |
| row[internal_index_columns[0]] |
| if len(internal_index_columns) == 1 |
| else tuple(row[c] for c in internal_index_columns) |
| ) |
| v = [row[c] for c in internal_data_columns] |
| return k, v |
| |
| for k, v in map( |
| extract_kv_from_spark_row, self._internal.resolved_copy.spark_frame.toLocalIterator() |
| ): |
| s = pd.Series(v, index=columns, name=k) |
| yield k, s |
| |
| def itertuples( |
| self, index: bool = True, name: Optional[str] = "PandasOnSpark" |
| ) -> Iterator[Tuple]: |
| """ |
| Iterate over DataFrame rows as namedtuples. |
| |
| Parameters |
| ---------- |
| index : bool, default True |
| If True, return the index as the first element of the tuple. |
| name : str or None, default "PandasOnSpark" |
| The name of the returned namedtuples or None to return regular |
| tuples. |
| |
| Returns |
| ------- |
| iterator |
| An object to iterate over namedtuples for each row in the |
| DataFrame with the first field possibly being the index and |
| following fields being the column values. |
| |
| See Also |
| -------- |
| DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) |
| pairs. |
| DataFrame.items : Iterate over (column name, Series) pairs. |
| |
| Notes |
| ----- |
| The column names will be renamed to positional names if they are |
| invalid Python identifiers, repeated, or start with an underscore. |
        On Python versions < 3.7, regular tuples are returned for DataFrames
        with a large number of columns (>254).
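
        For example, duplicate column names are renamed positionally; a minimal sketch
        (illustrative only, so it is skipped as a doctest):

        >>> next(ps.DataFrame([[1, 2]], columns=['x', 'x']).itertuples())  # doctest: +SKIP
        PandasOnSpark(Index=0, x=1, _2=2)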
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, |
| ... index=['dog', 'hawk']) |
| >>> df |
| num_legs num_wings |
| dog 4 0 |
| hawk 2 2 |
| |
| >>> for row in df.itertuples(): |
| ... print(row) |
| ... |
| PandasOnSpark(Index='dog', num_legs=4, num_wings=0) |
| PandasOnSpark(Index='hawk', num_legs=2, num_wings=2) |
| |
| By setting the `index` parameter to False we can remove the index |
| as the first element of the tuple: |
| |
| >>> for row in df.itertuples(index=False): |
| ... print(row) |
| ... |
| PandasOnSpark(num_legs=4, num_wings=0) |
| PandasOnSpark(num_legs=2, num_wings=2) |
| |
| With the `name` parameter set we set a custom name for the yielded |
| namedtuples: |
| |
| >>> for row in df.itertuples(name='Animal'): |
| ... print(row) |
| ... |
| Animal(Index='dog', num_legs=4, num_wings=0) |
| Animal(Index='hawk', num_legs=2, num_wings=2) |
| """ |
| fields = list(self.columns) |
| if index: |
| fields.insert(0, "Index") |
| |
| index_spark_column_names = self._internal.index_spark_column_names |
| data_spark_column_names = self._internal.data_spark_column_names |
| |
| def extract_kv_from_spark_row(row: Row) -> Tuple[Name, Any]: |
| k = ( |
| row[index_spark_column_names[0]] |
| if len(index_spark_column_names) == 1 |
| else tuple(row[c] for c in index_spark_column_names) |
| ) |
| v = [row[c] for c in data_spark_column_names] |
| return k, v |
| |
| can_return_named_tuples = sys.version_info >= (3, 7) or len(self.columns) + index < 255 |
| |
| if name is not None and can_return_named_tuples: |
| itertuple = namedtuple(name, fields, rename=True) # type: ignore |
| for k, v in map( |
| extract_kv_from_spark_row, |
| self._internal.resolved_copy.spark_frame.toLocalIterator(), |
| ): |
| yield itertuple._make(([k] if index else []) + list(v)) |
| else: |
| for k, v in map( |
| extract_kv_from_spark_row, |
| self._internal.resolved_copy.spark_frame.toLocalIterator(), |
| ): |
| yield tuple(([k] if index else []) + list(v)) |
| |
| def items(self) -> Iterator[Tuple[Name, "Series"]]: |
| """This is an alias of ``iteritems``.""" |
| return self.iteritems() |
| |
| def to_clipboard(self, excel: bool = True, sep: Optional[str] = None, **kwargs: Any) -> None: |
| """ |
| Copy object to the system clipboard. |
| |
| Write a text representation of object to the system clipboard. |
| This can be pasted into Excel, for example. |
| |
| .. note:: This method should only be used if the resulting DataFrame is expected |
| to be small, as all the data is loaded into the driver's memory. |
| |
| Parameters |
| ---------- |
| excel : bool, default True |
| - True, use the provided separator, writing in a csv format for |
| allowing easy pasting into excel. |
| - False, write a string representation of the object to the |
| clipboard. |
| |
| sep : str, default ``'\\t'`` |
| Field delimiter. |
| **kwargs |
| These parameters will be passed to DataFrame.to_csv. |
| |
| Notes |
| ----- |
| Requirements for your platform. |
| |
| - Linux : `xclip`, or `xsel` (with `gtk` or `PyQt4` modules) |
| - Windows : none |
| - OS X : none |
| |
| See Also |
| -------- |
| read_clipboard : Read text from clipboard. |
| |
| Examples |
| -------- |
| Copy the contents of a DataFrame to the clipboard. |
| |
| >>> df = ps.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C']) # doctest: +SKIP |
| >>> df.to_clipboard(sep=',') # doctest: +SKIP |
| ... # Wrote the following to the system clipboard: |
| ... # ,A,B,C |
| ... # 0,1,2,3 |
| ... # 1,4,5,6 |
| |
| We can omit the index by passing the keyword `index` and setting |
| it to false. |
| |
| >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP |
| ... # Wrote the following to the system clipboard: |
| ... # A,B,C |
| ... # 1,2,3 |
| ... # 4,5,6 |
| |
| This function also works for Series: |
| |
| >>> df = ps.Series([1, 2, 3, 4, 5, 6, 7], name='x') # doctest: +SKIP |
| >>> df.to_clipboard(sep=',') # doctest: +SKIP |
| ... # Wrote the following to the system clipboard: |
| ... # 0, 1 |
| ... # 1, 2 |
| ... # 2, 3 |
| ... # 3, 4 |
| ... # 4, 5 |
| ... # 5, 6 |
| ... # 6, 7 |
| """ |
| |
| args = locals() |
| psdf = self |
| return validate_arguments_and_invoke_function( |
| psdf._to_internal_pandas(), self.to_clipboard, pd.DataFrame.to_clipboard, args |
| ) |
| |
| def to_html( |
| self, |
| buf: Optional[IO[str]] = None, |
| columns: Optional[Sequence[Name]] = None, |
| col_space: Optional[Union[str, int, Dict[Name, Union[str, int]]]] = None, |
| header: bool = True, |
| index: bool = True, |
| na_rep: str = "NaN", |
| formatters: Optional[ |
| Union[List[Callable[[Any], str]], Dict[Name, Callable[[Any], str]]] |
| ] = None, |
| float_format: Optional[Callable[[float], str]] = None, |
| sparsify: Optional[bool] = None, |
| index_names: bool = True, |
| justify: Optional[str] = None, |
| max_rows: Optional[int] = None, |
| max_cols: Optional[int] = None, |
| show_dimensions: bool = False, |
| decimal: str = ".", |
| bold_rows: bool = True, |
| classes: Optional[Union[str, list, tuple]] = None, |
| escape: bool = True, |
| notebook: bool = False, |
| border: Optional[int] = None, |
| table_id: Optional[str] = None, |
| render_links: bool = False, |
| ) -> Optional[str]: |
| """ |
| Render a DataFrame as an HTML table. |
| |
| .. note:: This method should only be used if the resulting pandas object is expected |
| to be small, as all the data is loaded into the driver's memory. If the input |
| is large, set max_rows parameter. |
| |
| Parameters |
| ---------- |
| buf : StringIO-like, optional |
| Buffer to write to. |
| columns : sequence, optional, default None |
| The subset of columns to write. Writes all columns by default. |
| col_space : int, optional |
| The minimum width of each column. |
| header : bool, optional |
| Write out the column names. If a list of strings is given, it |
| is assumed to be aliases for the column names |
| index : bool, optional, default True |
| Whether to print index (row) labels. |
| na_rep : str, optional, default 'NaN' |
| String representation of NAN to use. |
| formatters : list or dict of one-param. functions, optional |
| Formatter functions to apply to columns' elements by position or |
| name. |
| The result of each function must be a unicode string. |
| List must be of length equal to the number of columns. |
| float_format : one-parameter function, optional, default None |
| Formatter function to apply to columns' elements if they are |
| floats. The result of this function must be a unicode string. |
| sparsify : bool, optional, default True |
| Set to False for a DataFrame with a hierarchical index to print |
| every multiindex key at each row. |
| index_names : bool, optional, default True |
| Prints the names of the indexes. |
| justify : str, default None |
| How to justify the column labels. If None uses the option from |
| the print configuration (controlled by set_option), 'right' out |
| of the box. Valid values are |
| |
| * left |
| * right |
| * center |
| * justify |
| * justify-all |
| * start |
| * end |
| * inherit |
| * match-parent |
| * initial |
| * unset. |
| max_rows : int, optional |
| Maximum number of rows to display in the console. |
| max_cols : int, optional |
| Maximum number of columns to display in the console. |
| show_dimensions : bool, default False |
| Display DataFrame dimensions (number of rows by number of columns). |
| decimal : str, default '.' |
| Character recognized as decimal separator, e.g. ',' in Europe. |
| bold_rows : bool, default True |
| Make the row labels bold in the output. |
| classes : str or list or tuple, default None |
| CSS class(es) to apply to the resulting html table. |
| escape : bool, default True |
| Convert the characters <, >, and & to HTML-safe sequences. |
| notebook : {True, False}, default False |
| Whether the generated HTML is for IPython Notebook. |
| border : int |
| A ``border=border`` attribute is included in the opening |
| `<table>` tag. Default ``pd.options.html.border``. |
| table_id : str, optional |
| A css id is included in the opening `<table>` tag if specified. |
| render_links : bool, default False |
| Convert URLs to HTML links (only works with pandas 0.24+). |
| |
| Returns |
| ------- |
| str (or unicode, depending on data and options) |
| String representation of the dataframe. |
| |
| See Also |
| -------- |
| to_string : Convert DataFrame to a string. |
| """ |
| # Make sure locals() call is at the top of the function so we don't capture local variables. |
| args = locals() |
| if max_rows is not None: |
| psdf = self.head(max_rows) |
| else: |
| psdf = self |
| |
| return validate_arguments_and_invoke_function( |
| psdf._to_internal_pandas(), self.to_html, pd.DataFrame.to_html, args |
| ) |
| |
| def to_string( |
| self, |
| buf: Optional[IO[str]] = None, |
| columns: Optional[Sequence[Name]] = None, |
| col_space: Optional[Union[str, int, Dict[Name, Union[str, int]]]] = None, |
| header: bool = True, |
| index: bool = True, |
| na_rep: str = "NaN", |
| formatters: Optional[ |
| Union[List[Callable[[Any], str]], Dict[Name, Callable[[Any], str]]] |
| ] = None, |
| float_format: Optional[Callable[[float], str]] = None, |
| sparsify: Optional[bool] = None, |
| index_names: bool = True, |
| justify: Optional[str] = None, |
| max_rows: Optional[int] = None, |
| max_cols: Optional[int] = None, |
| show_dimensions: bool = False, |
| decimal: str = ".", |
| line_width: Optional[int] = None, |
| ) -> Optional[str]: |
| """ |
| Render a DataFrame to a console-friendly tabular output. |
| |
| .. note:: This method should only be used if the resulting pandas object is expected |
| to be small, as all the data is loaded into the driver's memory. If the input |
| is large, set max_rows parameter. |
| |
| Parameters |
| ---------- |
| buf : StringIO-like, optional |
| Buffer to write to. |
| columns : sequence, optional, default None |
| The subset of columns to write. Writes all columns by default. |
| col_space : int, optional |
| The minimum width of each column. |
| header : bool, optional |
| Write out the column names. If a list of strings is given, it |
| is assumed to be aliases for the column names |
| index : bool, optional, default True |
| Whether to print index (row) labels. |
| na_rep : str, optional, default 'NaN' |
| String representation of NAN to use. |
| formatters : list or dict of one-param. functions, optional |
| Formatter functions to apply to columns' elements by position or |
| name. |
| The result of each function must be a unicode string. |
| List must be of length equal to the number of columns. |
| float_format : one-parameter function, optional, default None |
| Formatter function to apply to columns' elements if they are |
| floats. The result of this function must be a unicode string. |
| sparsify : bool, optional, default True |
| Set to False for a DataFrame with a hierarchical index to print |
| every multiindex key at each row. |
| index_names : bool, optional, default True |
| Prints the names of the indexes. |
| justify : str, default None |
| How to justify the column labels. If None uses the option from |
| the print configuration (controlled by set_option), 'right' out |
| of the box. Valid values are |
| |
| * left |
| * right |
| * center |
| * justify |
| * justify-all |
| * start |
| * end |
| * inherit |
| * match-parent |
| * initial |
| * unset. |
| max_rows : int, optional |
| Maximum number of rows to display in the console. |
| max_cols : int, optional |
| Maximum number of columns to display in the console. |
| show_dimensions : bool, default False |
| Display DataFrame dimensions (number of rows by number of columns). |
| decimal : str, default '.' |
| Character recognized as decimal separator, e.g. ',' in Europe. |
| line_width : int, optional |
| Width to wrap a line in characters. |
| |
| Returns |
| ------- |
| str (or unicode, depending on data and options) |
| String representation of the dataframe. |
| |
| See Also |
| -------- |
| to_html : Convert DataFrame to HTML. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]}, columns=['col1', 'col2']) |
| >>> print(df.to_string()) |
| col1 col2 |
| 0 1 4 |
| 1 2 5 |
| 2 3 6 |
| |
| >>> print(df.to_string(max_rows=2)) |
| col1 col2 |
| 0 1 4 |
| 1 2 5 |
| """ |
| # Make sure locals() call is at the top of the function so we don't capture local variables. |
| args = locals() |
| if max_rows is not None: |
| psdf = self.head(max_rows) |
| else: |
| psdf = self |
| |
| return validate_arguments_and_invoke_function( |
| psdf._to_internal_pandas(), self.to_string, pd.DataFrame.to_string, args |
| ) |
| |
| def to_dict(self, orient: str = "dict", into: Type = dict) -> Union[List, Mapping]: |
| """ |
| Convert the DataFrame to a dictionary. |
| |
| The type of the key-value pairs can be customized with the parameters |
| (see below). |
| |
| .. note:: This method should only be used if the resulting pandas DataFrame is expected |
| to be small, as all the data is loaded into the driver's memory. |
| |
| Parameters |
| ---------- |
| orient : str {'dict', 'list', 'series', 'split', 'records', 'index'} |
| Determines the type of the values of the dictionary. |
| |
| - 'dict' (default) : dict like {column -> {index -> value}} |
| - 'list' : dict like {column -> [values]} |
| - 'series' : dict like {column -> Series(values)} |
| - 'split' : dict like |
| {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} |
| - 'records' : list like |
| [{column -> value}, ... , {column -> value}] |
| - 'index' : dict like {index -> {column -> value}} |
| |
| Abbreviations are allowed. `s` indicates `series` and `sp` |
| indicates `split`. |
| |
| into : class, default dict |
| The collections.abc.Mapping subclass used for all Mappings |
| in the return value. Can be the actual class or an empty |
| instance of the mapping type you want. If you want a |
| collections.defaultdict, you must pass it initialized. |
| |
| Returns |
| ------- |
| dict, list or collections.abc.Mapping |
| Return a collections.abc.Mapping object representing the DataFrame. |
| The resulting transformation depends on the `orient` parameter. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'col1': [1, 2], |
| ... 'col2': [0.5, 0.75]}, |
| ... index=['row1', 'row2'], |
| ... columns=['col1', 'col2']) |
| >>> df |
| col1 col2 |
| row1 1 0.50 |
| row2 2 0.75 |
| |
| >>> df_dict = df.to_dict() |
| >>> sorted([(key, sorted(values.items())) for key, values in df_dict.items()]) |
| [('col1', [('row1', 1), ('row2', 2)]), ('col2', [('row1', 0.5), ('row2', 0.75)])] |
| |
| You can specify the return orientation. |
| |
| >>> df_dict = df.to_dict('series') |
| >>> sorted(df_dict.items()) |
| [('col1', row1 1 |
| row2 2 |
| Name: col1, dtype: int64), ('col2', row1 0.50 |
| row2 0.75 |
| Name: col2, dtype: float64)] |
| |
| >>> df_dict = df.to_dict('split') |
| >>> sorted(df_dict.items()) # doctest: +ELLIPSIS |
| [('columns', ['col1', 'col2']), ('data', [[1..., 0.75]]), ('index', ['row1', 'row2'])] |
| |
| >>> df_dict = df.to_dict('records') |
| >>> [sorted(values.items()) for values in df_dict] # doctest: +ELLIPSIS |
| [[('col1', 1...), ('col2', 0.5)], [('col1', 2...), ('col2', 0.75)]] |
| |
| >>> df_dict = df.to_dict('index') |
| >>> sorted([(key, sorted(values.items())) for key, values in df_dict.items()]) |
| [('row1', [('col1', 1), ('col2', 0.5)]), ('row2', [('col1', 2), ('col2', 0.75)])] |
| |
| You can also specify the mapping type. |
| |
| >>> from collections import OrderedDict, defaultdict |
| >>> df.to_dict(into=OrderedDict) |
| OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), \ |
| ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) |
| |
| If you want a `defaultdict`, you need to initialize it: |
| |
| >>> dd = defaultdict(list) |
| >>> df.to_dict('records', into=dd) # doctest: +ELLIPSIS |
| [defaultdict(<class 'list'>, {'col..., 'col...}), \ |
| defaultdict(<class 'list'>, {'col..., 'col...})] |
| """ |
| # Make sure locals() call is at the top of the function so we don't capture local variables. |
| args = locals() |
| psdf = self |
| return validate_arguments_and_invoke_function( |
| psdf._to_internal_pandas(), self.to_dict, pd.DataFrame.to_dict, args |
| ) |
| |
| def to_latex( |
| self, |
| buf: Optional[IO[str]] = None, |
| columns: Optional[List[Name]] = None, |
| col_space: Optional[int] = None, |
| header: bool = True, |
| index: bool = True, |
| na_rep: str = "NaN", |
| formatters: Optional[ |
| Union[List[Callable[[Any], str]], Dict[Name, Callable[[Any], str]]] |
| ] = None, |
| float_format: Optional[Callable[[float], str]] = None, |
| sparsify: Optional[bool] = None, |
| index_names: bool = True, |
| bold_rows: bool = False, |
| column_format: Optional[str] = None, |
| longtable: Optional[bool] = None, |
| escape: Optional[bool] = None, |
| encoding: Optional[str] = None, |
| decimal: str = ".", |
| multicolumn: Optional[bool] = None, |
| multicolumn_format: Optional[str] = None, |
| multirow: Optional[bool] = None, |
| ) -> Optional[str]: |
| r""" |
| Render an object to a LaTeX tabular environment table. |
| |
| You can splice this into a LaTeX document. Requires \usepackage{booktabs} |
| in your LaTeX preamble. |
| |
| .. note:: This method should only be used if the resulting pandas object is expected |
| to be small, as all the data is loaded into the driver's memory. If the input |
| is large, consider alternative formats. |
| |
| Parameters |
| ---------- |
| buf : file descriptor or None |
| Buffer to write to. If None, the output is returned as a string. |
| columns : list of label, optional |
| The subset of columns to write. Writes all columns by default. |
| col_space : int, optional |
| The minimum width of each column. |
| header : bool or list of str, default True |
| Write out the column names. If a list of strings is given, it is assumed to be aliases |
| for the column names. |
| index : bool, default True |
| Write row names (index). |
| na_rep : str, default 'NaN' |
| Missing data representation. |
| formatters : list of functions or dict of {str: function}, optional |
| Formatter functions to apply to columns' elements by position or name. The result of |
| each function must be a unicode string. List must be of length equal to the number of |
| columns. |
| float_format : str, optional |
| Format string for floating point numbers. |
| sparsify : bool, optional |
| Set to False for a DataFrame with a hierarchical index to print every multiindex key at |
| each row. By default, the value will be read from the config module. |
| index_names : bool, default True |
| Prints the names of the indexes. |
| bold_rows : bool, default False |
| Make the row labels bold in the output. |
| column_format : str, optional |
| The columns format as specified in LaTeX table format e.g. 'rcl' for 3 columns. By |
| default, 'l' will be used for all columns except columns of numbers, which default |
| to 'r'. |
| longtable : bool, optional |
| By default, the value will be read from the pandas config module. Use a longtable |
| environment instead of tabular. Requires adding a \usepackage{longtable} to your LaTeX |
| preamble. |
| escape : bool, optional |
| By default, the value will be read from the pandas config module. When set to False, |
| it prevents escaping LaTeX special characters in column names. |
| encoding : str, optional |
| A string representing the encoding to use in the output file, defaults to 'utf-8'. |
| decimal : str, default '.' |
| Character recognized as decimal separator, e.g. ',' in Europe. |
| multicolumn : bool, default True |
| Use multicolumn to enhance MultiIndex columns. The default will be read from the config |
| module. |
| multicolumn_format : str, default 'l' |
| The alignment for multicolumns, similar to column_format. The default will be read |
| from the config module. |
| multirow : bool, default False |
| Use multirow to enhance MultiIndex rows. Requires adding a \usepackage{multirow} to your |
| LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained |
| rows, separating groups via clines. The default will be read from the pandas config |
| module. |
| |
| Returns |
| ------- |
| str or None |
| If buf is None, returns the resulting LaTeX format as a string. Otherwise returns None. |
| |
| See Also |
| -------- |
| DataFrame.to_string : Render a DataFrame to a console-friendly |
| tabular output. |
| DataFrame.to_html : Render a DataFrame as an HTML table. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'name': ['Raphael', 'Donatello'], |
| ... 'mask': ['red', 'purple'], |
| ... 'weapon': ['sai', 'bo staff']}, |
| ... columns=['name', 'mask', 'weapon']) |
| >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE |
| \begin{tabular}{lll} |
| \toprule |
| name & mask & weapon \\ |
| \midrule |
| Raphael & red & sai \\ |
| Donatello & purple & bo staff \\ |
| \bottomrule |
| \end{tabular} |
| <BLANKLINE> |
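| |
| A column format can be given explicitly; a sketch (skipped as a doctest |
| because the exact output depends on the pandas version): |
| |
| >>> print(df.to_latex(index=False, column_format='rrr'))  # doctest: +SKIP |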
| """ |
| |
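| # Make sure locals() call is at the top of the function so we don't capture local variables. |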
| args = locals() |
| psdf = self |
| return validate_arguments_and_invoke_function( |
| psdf._to_internal_pandas(), self.to_latex, pd.DataFrame.to_latex, args |
| ) |
| |
| # TODO: enable doctests once we drop Spark 2.3.x (due to type coercion logic |
| # when creating arrays) |
| def transpose(self) -> "DataFrame": |
| """ |
| Transpose index and columns. |
| |
| Reflect the DataFrame over its main diagonal by writing rows as columns |
| and vice-versa. The property :attr:`.T` is an accessor to the method |
| :meth:`transpose`. |
| |
| .. note:: This method is based on an expensive operation due to the nature |
| of big data. Internally it needs to generate each row for each value, and |
| then group twice - it is a huge operation. To prevent misuse, this method |
| limits the input length to the 'compute.max_rows' option and raises a |
| ValueError when that limit is exceeded. |
| |
| >>> from pyspark.pandas.config import option_context |
| >>> with option_context('compute.max_rows', 1000): # doctest: +NORMALIZE_WHITESPACE |
| ... ps.DataFrame({'a': range(1001)}).transpose() |
| Traceback (most recent call last): |
| ... |
| ValueError: Current DataFrame has more than the given limit 1000 rows. |
| Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' |
| to retrieve more than 1000 rows. Note that, before changing the |
| 'compute.max_rows', this operation is considerably expensive. |
| |
| Returns |
| ------- |
| DataFrame |
| The transposed DataFrame. |
| |
| Notes |
| ----- |
| Transposing a DataFrame with mixed dtypes will result in a homogeneous |
| DataFrame with the coerced dtype. For instance, if int and float have |
| to be placed in the same column, it becomes float. If type coercion is not |
| possible, it fails. |
| |
| Also, note that the values in index should be unique because they become |
| unique column names. |
| |
| In addition, if Spark 2.3 is used, the types should always be exactly the same. |
| |
| Examples |
| -------- |
| **Square DataFrame with homogeneous dtype** |
| |
| >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} |
| >>> df1 = ps.DataFrame(data=d1, columns=['col1', 'col2']) |
| >>> df1 |
| col1 col2 |
| 0 1 3 |
| 1 2 4 |
| |
| >>> df1_transposed = df1.T.sort_index() # doctest: +SKIP |
| >>> df1_transposed # doctest: +SKIP |
| 0 1 |
| col1 1 2 |
| col2 3 4 |
| |
| When the dtype is homogeneous in the original DataFrame, we get a |
| transposed DataFrame with the same dtype: |
| |
| >>> df1.dtypes |
| col1 int64 |
| col2 int64 |
| dtype: object |
| >>> df1_transposed.dtypes # doctest: +SKIP |
| 0 int64 |
| 1 int64 |
| dtype: object |
| |
| **Non-square DataFrame with mixed dtypes** |
| |
| >>> d2 = {'score': [9.5, 8], |
| ... 'kids': [0, 0], |
| ... 'age': [12, 22]} |
| >>> df2 = ps.DataFrame(data=d2, columns=['score', 'kids', 'age']) |
| >>> df2 |
| score kids age |
| 0 9.5 0 12 |
| 1 8.0 0 22 |
| |
| >>> df2_transposed = df2.T.sort_index() # doctest: +SKIP |
| >>> df2_transposed # doctest: +SKIP |
| 0 1 |
| age 12.0 22.0 |
| kids 0.0 0.0 |
| score 9.5 8.0 |
| |
| When the DataFrame has mixed dtypes, we get a transposed DataFrame with |
| the coerced dtype: |
| |
| >>> df2.dtypes |
| score float64 |
| kids int64 |
| age int64 |
| dtype: object |
| |
| >>> df2_transposed.dtypes # doctest: +SKIP |
| 0 float64 |
| 1 float64 |
| dtype: object |
| """ |
| max_compute_count = get_option("compute.max_rows") |
| if max_compute_count is not None: |
| pdf = self.head(max_compute_count + 1)._to_internal_pandas() |
| if len(pdf) > max_compute_count: |
| raise ValueError( |
| "Current DataFrame has more then the given limit {0} rows. " |
| "Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' " |
| "to retrieve to retrieve more than {0} rows. Note that, before changing the " |
| "'compute.max_rows', this operation is considerably expensive.".format( |
| max_compute_count |
| ) |
| ) |
| return DataFrame(pdf.transpose()) |
| |
| # Explode the data to be pairs. |
| # |
| # For instance, if the current input DataFrame is as below: |
| # |
| # +------+------+------+------+------+ |
| # |index1|index2|(a,x1)|(a,x2)|(b,x3)| |
| # +------+------+------+------+------+ |
| # | y1| z1| 1| 0| 0| |
| # | y2| z2| 0| 50| 0| |
| # | y3| z3| 3| 2| 1| |
| # +------+------+------+------+------+ |
| # |
| # Output of `exploded_df` becomes as below: |
| # |
| # +-----------------+-----------------+-----------------+-----+ |
| # | index|__index_level_0__|__index_level_1__|value| |
| # +-----------------+-----------------+-----------------+-----+ |
| # |{"a":["y1","z1"]}| a| x1| 1| |
| # |{"a":["y1","z1"]}| a| x2| 0| |
| # |{"a":["y1","z1"]}| b| x3| 0| |
| # |{"a":["y2","z2"]}| a| x1| 0| |
| # |{"a":["y2","z2"]}| a| x2| 50| |
| # |{"a":["y2","z2"]}| b| x3| 0| |
| # |{"a":["y3","z3"]}| a| x1| 3| |
| # |{"a":["y3","z3"]}| a| x2| 2| |
| # |{"a":["y3","z3"]}| b| x3| 1| |
| # +-----------------+-----------------+-----------------+-----+ |
| pairs = F.explode( |
| F.array( |
| *[ |
| F.struct( |
| *[ |
| SF.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) |
| for i, col in enumerate(label) |
| ], |
| *[self._internal.spark_column_for(label).alias("value")], |
| ) |
| for label in self._internal.column_labels |
| ] |
| ) |
| ) |
| |
| exploded_df = self._internal.spark_frame.withColumn("pairs", pairs).select( |
| [ |
| F.to_json( |
| F.struct( |
| F.array(*[scol for scol in self._internal.index_spark_columns]).alias("a") |
| ) |
| ).alias("index"), |
| F.col("pairs.*"), |
| ] |
| ) |
| |
| # After that, executes pivot with key and its index column. |
| # Note that index column should contain unique values since column names |
| # should be unique. |
| internal_index_columns = [ |
| SPARK_INDEX_NAME_FORMAT(i) for i in range(self._internal.column_labels_level) |
| ] |
| pivoted_df = exploded_df.groupBy(internal_index_columns).pivot("index") |
| |
| transposed_df = pivoted_df.agg(F.first(F.col("value"))) |
| |
| new_data_columns = list( |
| filter(lambda x: x not in internal_index_columns, transposed_df.columns) |
| ) |
| |
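| # The pivoted column names are the JSON strings produced for the "index" |
| # column above; decode them back into tuples to restore the original index |
| # values as column labels. |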
| column_labels = [ |
| None if len(label) == 1 and label[0] is None else label |
| for label in (tuple(json.loads(col)["a"]) for col in new_data_columns) |
| ] |
| |
| internal = InternalFrame( |
| spark_frame=transposed_df, |
| index_spark_columns=[scol_for(transposed_df, col) for col in internal_index_columns], |
| index_names=self._internal.column_label_names, |
| column_labels=column_labels, |
| data_spark_columns=[scol_for(transposed_df, col) for col in new_data_columns], |
| column_label_names=self._internal.index_names, |
| ) |
| |
| return DataFrame(internal) |
| |
| T = property(transpose) |
| |
| def apply( |
| self, func: Callable, axis: Axis = 0, args: Sequence[Any] = (), **kwds: Any |
| ) -> Union["Series", "DataFrame", "Index"]: |
| """ |
| Apply a function along an axis of the DataFrame. |
| |
| Objects passed to the function are Series objects whose index is |
| either the DataFrame's index (``axis=0``) or the DataFrame's columns |
| (``axis=1``). |
| |
| See also `Transform and apply a function |
| <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. |
| |
| .. note:: when `axis` is 0 or 'index', the `func` is unable to access |
| the whole input series. pandas-on-Spark internally splits the input series into |
| multiple batches and calls `func` with each batch multiple times. Therefore, operations |
| such as global aggregations are impossible. See the example below. |
| |
| >>> # This case does not return the length of the whole series but of the |
| ... # batch internally used. |
| ... def length(s) -> int: |
| ... return len(s) |
| ... |
| >>> df = ps.DataFrame({'A': range(1000)}) |
| >>> df.apply(length, axis=0) # doctest: +SKIP |
| 0 83 |
| 1 83 |
| 2 83 |
| ... |
| 10 83 |
| 11 83 |
| dtype: int32 |
| |
| .. note:: this API executes the function once to infer the type which is |
| potentially expensive, for instance, when the dataset is created after |
| aggregations or sorting. |
| |
| To avoid this, specify the return type as `Series` or scalar value in ``func``, |
| for instance, as below: |
| |
| >>> def square(s) -> ps.Series[np.int32]: |
| ... return s ** 2 |
| |
| pandas-on-Spark uses return type hint and does not try to infer the type. |
| |
| In case when axis is 1, it requires to specify `DataFrame` or scalar value |
| with type hints as below: |
| |
| >>> def plus_one(x) -> ps.DataFrame[float, float]: |
| ... return x + 1 |
| |
| If the return type is specified as `DataFrame`, the output column names become |
| `c0, c1, c2 ... cn`. These names are positionally mapped to the returned |
| DataFrame in ``func``. |
| |
| To specify the column names, you can assign them in a pandas friendly style as below: |
| |
| >>> def plus_one(x) -> ps.DataFrame["a": float, "b": float]: |
| ... return x + 1 |
| |
| >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) |
| >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]: |
| ... return x + 1 |
| |
| However, this way switches the index type to the default index type in the |
| output because the type hint cannot express the index type at this moment. |
| Use `reset_index()` to keep the index as a workaround. |
| |
| When the given function has the return type annotated, the original index of the |
| DataFrame will be lost and then a default index will be attached to the result. |
| Please be careful about configuring the default index. See also `Default Index Type |
| <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_. |
| |
| Parameters |
| ---------- |
| func : function |
| Function to apply to each column or row. |
| axis : {0 or 'index', 1 or 'columns'}, default 0 |
| Axis along which the function is applied: |
| |
| * 0 or 'index': apply function to each column. |
| * 1 or 'columns': apply function to each row. |
| args : tuple |
| Positional arguments to pass to `func` in addition to the |
| array/series. |
| **kwds |
| Additional keyword arguments to pass as keyword arguments to |
| `func`. |
| |
| Returns |
| ------- |
| Series or DataFrame |
| Result of applying ``func`` along the given axis of the |
| DataFrame. |
| |
| See Also |
| -------- |
| DataFrame.applymap : For elementwise operations. |
| DataFrame.aggregate : Only perform aggregating type operations. |
| DataFrame.transform : Only perform transforming type operations. |
| Series.apply : The equivalent function for Series. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([[4, 9]] * 3, columns=['A', 'B']) |
| >>> df |
| A B |
| 0 4 9 |
| 1 4 9 |
| 2 4 9 |
| |
| Using a numpy universal function (in this case the same as |
| ``np.sqrt(df)``): |
| |
| >>> def sqrt(x) -> ps.Series[float]: |
| ... return np.sqrt(x) |
| ... |
| >>> df.apply(sqrt, axis=0) |
| A B |
| 0 2.0 3.0 |
| 1 2.0 3.0 |
| 2 2.0 3.0 |
| |
| You can omit the type hint and let pandas-on-Spark infer its type. |
| |
| >>> df.apply(np.sqrt, axis=0) |
| A B |
| 0 2.0 3.0 |
| 1 2.0 3.0 |
| 2 2.0 3.0 |
| |
| When `axis` is 1 or 'columns', it applies the function for each row. |
| |
| >>> def summation(x) -> np.int64: |
| ... return np.sum(x) |
| ... |
| >>> df.apply(summation, axis=1) |
| 0 13 |
| 1 13 |
| 2 13 |
| dtype: int64 |
| |
| Likewise, you can omit the type hint and let pandas-on-Spark infer its type. |
| |
| >>> df.apply(np.sum, axis=1) |
| 0 13 |
| 1 13 |
| 2 13 |
| dtype: int64 |
| |
| >>> df.apply(max, axis=1) |
| 0 9 |
| 1 9 |
| 2 9 |
| dtype: int64 |
| |
| Returning a list-like will result in a Series |
| |
| >>> df.apply(lambda x: [1, 2], axis=1) |
| 0 [1, 2] |
| 1 [1, 2] |
| 2 [1, 2] |
| dtype: object |
| |
| To specify the types when `axis` is 1, use the DataFrame[...] annotation. |
| In this case, the column names are automatically generated. |
| |
| >>> def identify(x) -> ps.DataFrame['A': np.int64, 'B': np.int64]: |
| ... return x |
| ... |
| >>> df.apply(identify, axis=1) |
| A B |
| 0 4 9 |
| 1 4 9 |
| 2 4 9 |
| |
| You can also specify extra arguments. |
| |
| >>> def plus_two(a, b, c) -> ps.DataFrame[np.int64, np.int64]: |
| ... return a + b + c |
| ... |
| >>> df.apply(plus_two, axis=1, args=(1,), c=3) |
| c0 c1 |
| 0 8 13 |
| 1 8 13 |
| 2 8 13 |
| """ |
| from pyspark.pandas.groupby import GroupBy |
| from pyspark.pandas.series import first_series |
| |
| if not isinstance(func, types.FunctionType): |
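| # Built-in and other non-plain-function callables cannot be introspected by |
| # inspect.getfullargspec below, so wrap them in a plain lambda first. |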
| assert callable(func), "the first argument should be a callable function." |
| f = func |
| func = lambda *args, **kwargs: f(*args, **kwargs) |
| |
| axis = validate_axis(axis) |
| should_return_series = False |
| spec = inspect.getfullargspec(func) |
| return_sig = spec.annotations.get("return", None) |
| should_infer_schema = return_sig is None |
| |
| def apply_func(pdf: pd.DataFrame) -> pd.DataFrame: |
| pdf_or_pser = pdf.apply(func, axis=axis, args=args, **kwds) |
| if isinstance(pdf_or_pser, pd.Series): |
| return pdf_or_pser.to_frame() |
| else: |
| return pdf_or_pser |
| |
| self_applied = DataFrame(self._internal.resolved_copy) # type: "DataFrame" |
| |
| column_labels = None # type: Optional[List[Label]] |
| if should_infer_schema: |
| # Here we execute with the first `compute.shortcut_limit` records to infer the |
| # return type. If there are fewer records than the limit, the pandas API is |
| # used directly as a shortcut. |
| limit = get_option("compute.shortcut_limit") |
| pdf = self_applied.head(limit + 1)._to_internal_pandas() |
| applied = pdf.apply(func, axis=axis, args=args, **kwds) |
| psser_or_psdf = ps.from_pandas(applied) |
| if len(pdf) <= limit: |
| return psser_or_psdf |
| |
| psdf = psser_or_psdf |
| if isinstance(psser_or_psdf, ps.Series): |
| should_return_series = True |
| psdf = psser_or_psdf._psdf |
| |
| index_fields = [field.normalize_spark_type() for field in psdf._internal.index_fields] |
| data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields] |
| |
| return_schema = StructType([field.struct_field for field in index_fields + data_fields]) |
| |
| output_func = GroupBy._make_pandas_df_builder_func( |
| self_applied, apply_func, return_schema, retain_index=True |
| ) |
| sdf = self_applied._internal.to_internal_spark_frame.mapInPandas( |
| lambda iterator: map(output_func, iterator), schema=return_schema |
| ) |
| |
| # If schema is inferred, we can restore indexes too. |
| internal = psdf._internal.with_new_sdf( |
| spark_frame=sdf, index_fields=index_fields, data_fields=data_fields |
| ) |
| else: |
| return_type = infer_return_type(func) |
| require_index_axis = isinstance(return_type, SeriesType) |
| require_column_axis = isinstance(return_type, DataFrameType) |
| |
| if require_index_axis: |
| if axis != 0: |
| raise TypeError( |
| "The given function should specify a scalar or a series as its type " |
| "hints when axis is 0 or 'index'; however, the return type " |
| "was %s" % return_sig |
| ) |
| dtype = cast(SeriesType, return_type).dtype |
| spark_type = cast(SeriesType, return_type).spark_type |
| data_fields = [ |
| InternalField( |
| dtype=dtype, struct_field=StructField(name=name, dataType=spark_type) |
| ) |
| for name in self_applied.columns |
| ] |
| return_schema = StructType([field.struct_field for field in data_fields]) |
| elif require_column_axis: |
| if axis != 1: |
| raise TypeError( |
| "The given function should specify a scalar or a frame as its type " |
| "hints when axis is 1 or 'column'; however, the return type " |
| "was %s" % return_sig |
| ) |
| data_fields = cast(DataFrameType, return_type).fields |
| return_schema = cast(DataFrameType, return_type).spark_type |
| else: |
| # any axis is fine. |
| should_return_series = True |
| spark_type = cast(ScalarType, return_type).spark_type |
| dtype = cast(ScalarType, return_type).dtype |
| data_fields = [ |
| InternalField( |
| dtype=dtype, |
| struct_field=StructField( |
| name=SPARK_DEFAULT_SERIES_NAME, dataType=spark_type |
| ), |
| ) |
| ] |
| return_schema = StructType([field.struct_field for field in data_fields]) |
| column_labels = [None] |
| |
| output_func = GroupBy._make_pandas_df_builder_func( |
| self_applied, apply_func, return_schema, retain_index=False |
| ) |
| sdf = self_applied._internal.to_internal_spark_frame.mapInPandas( |
| lambda iterator: map(output_func, iterator), schema=return_schema |
| ) |
| |
| # Otherwise, the original index is lost and a default index is attached. |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=None, |
| column_labels=column_labels, |
| data_fields=data_fields, |
| ) |
| |
| result = DataFrame(internal) # type: "DataFrame" |
| if should_return_series: |
| return first_series(result) |
| else: |
| return result |
| |
| def transform( |
| self, func: Callable[..., "Series"], axis: Axis = 0, *args: Any, **kwargs: Any |
| ) -> "DataFrame": |
| """ |
| Call ``func`` on self producing a Series with transformed values |
| and that has the same length as its input. |
| |
| See also `Transform and apply a function |
| <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. |
| |
| .. note:: this API executes the function once to infer the type which is |
| potentially expensive, for instance, when the dataset is created after |
| aggregations or sorting. |
| |
| To avoid this, specify return type in ``func``, for instance, as below: |
| |
| >>> def square(x) -> ps.Series[np.int32]: |
| ... return x ** 2 |
| |
| pandas-on-Spark uses return type hint and does not try to infer the type. |
| |
| .. note:: the series within ``func`` is actually multiple pandas series as the |
| segments of the whole pandas-on-Spark series; therefore, the length of each series |
| is not guaranteed. As an example, an aggregation against each series works not as |
| a global aggregation but as an aggregation of each segment. See below: |
| |
| >>> def func(x) -> ps.Series[np.int32]: |
| ... return x + sum(x) |
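| |
| For instance, a sketch of that behavior (skipped as a doctest because the |
| per-segment result depends on how the data is batched): |
| |
| >>> ps.DataFrame({'A': range(1000)}).transform(func)  # doctest: +SKIP |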
| |
| Parameters |
| ---------- |
| func : function |
| Function to use for transforming the data. It must work when a pandas Series |
| is passed. |
| axis : {0 or 'index'}, default 0 |
| Can only be set to 0 at the moment. |
| *args |
| Positional arguments to pass to func. |
| **kwargs |
| Keyword arguments to pass to func. |
| |
| Returns |
| ------- |
| DataFrame |
| A DataFrame that must have the same length as self. |
| |
| Raises |
| ------ |
| Exception : If the returned DataFrame has a different length than self. |
| |
| See Also |
| -------- |
| DataFrame.aggregate : Only perform aggregating type operations. |
| DataFrame.apply : Invoke function on DataFrame. |
| Series.transform : The equivalent function for Series. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': range(3), 'B': range(1, 4)}, columns=['A', 'B']) |
| >>> df |
| A B |
| 0 0 1 |
| 1 1 2 |
| 2 2 3 |
| |
| >>> def square(x) -> ps.Series[np.int32]: |
| ... return x ** 2 |
| >>> df.transform(square) |
| A B |
| 0 0 1 |
| 1 1 4 |
| 2 4 9 |
| |
| You can omit the type hint and let pandas-on-Spark infer its type. |
| |
| >>> df.transform(lambda x: x ** 2) |
| A B |
| 0 0 1 |
| 1 1 4 |
| 2 4 9 |
| |
| For multi-index columns: |
| |
| >>> df.columns = [('X', 'A'), ('X', 'B')] |
| >>> df.transform(square) # doctest: +NORMALIZE_WHITESPACE |
| X |
| A B |
| 0 0 1 |
| 1 1 4 |
| 2 4 9 |
| |
| >>> (df * -1).transform(abs) # doctest: +NORMALIZE_WHITESPACE |
| X |
| A B |
| 0 0 1 |
| 1 1 2 |
| 2 2 3 |
| |
| You can also specify extra arguments. |
| |
| >>> def calculation(x, y, z) -> ps.Series[int]: |
| ... return x ** y + z |
| >>> df.transform(calculation, y=10, z=20) # doctest: +NORMALIZE_WHITESPACE |
| X |
| A B |
| 0 20 21 |
| 1 21 1044 |
| 2 1044 59069 |
| """ |
| if not isinstance(func, types.FunctionType): |
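| # Built-in and other non-plain-function callables cannot be introspected by |
| # inspect.getfullargspec below, so wrap them in a plain lambda first. |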
| assert callable(func), "the first argument should be a callable function." |
| f = func |
| func = lambda *args, **kwargs: f(*args, **kwargs) |
| |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError('axis should be either 0 or "index" currently.') |
| |
| spec = inspect.getfullargspec(func) |
| return_sig = spec.annotations.get("return", None) |
| should_infer_schema = return_sig is None |
| |
| if should_infer_schema: |
| # Here we execute with the first `compute.shortcut_limit` records to infer the |
| # return type. If there are fewer records than the limit, the pandas API is |
| # used directly as a shortcut. |
| limit = get_option("compute.shortcut_limit") |
| pdf = self.head(limit + 1)._to_internal_pandas() |
| transformed = pdf.transform(func, axis, *args, **kwargs) |
| psdf = DataFrame(transformed) # type: "DataFrame" |
| if len(pdf) <= limit: |
| return psdf |
| |
| applied = [] |
| data_fields = [] |
| for input_label, output_label in zip( |
| self._internal.column_labels, psdf._internal.column_labels |
| ): |
| psser = self._psser_for(input_label) |
| |
| field = psdf._internal.field_for(output_label).normalize_spark_type() |
| data_fields.append(field) |
| |
| return_schema = field.spark_type |
| applied.append( |
| psser.pandas_on_spark._transform_batch( |
| func=lambda c: func(c, *args, **kwargs), |
| return_type=SeriesType(field.dtype, return_schema), |
| ) |
| ) |
| |
| internal = self._internal.with_new_columns(applied, data_fields=data_fields) |
| return DataFrame(internal) |
| else: |
| return self._apply_series_op( |
| lambda psser: psser.pandas_on_spark.transform_batch(func, *args, **kwargs) |
| ) |
| |
| def pop(self, item: Name) -> DataFrameOrSeries: |
| """ |
| Return item and drop from frame. Raise KeyError if not found. |
| |
| Parameters |
| ---------- |
| item : str |
| Label of column to be popped. |
| |
| Returns |
| ------- |
| Series or DataFrame |
| The popped column(s); a DataFrame is returned when `item` selects |
| multiple columns of a MultiIndex. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([('falcon', 'bird', 389.0), |
| ... ('parrot', 'bird', 24.0), |
| ... ('lion', 'mammal', 80.5), |
| ... ('monkey','mammal', np.nan)], |
| ... columns=('name', 'class', 'max_speed')) |
| |
| >>> df |
| name class max_speed |
| 0 falcon bird 389.0 |
| 1 parrot bird 24.0 |
| 2 lion mammal 80.5 |
| 3 monkey mammal NaN |
| |
| >>> df.pop('class') |
| 0 bird |
| 1 bird |
| 2 mammal |
| 3 mammal |
| Name: class, dtype: object |
| |
| >>> df |
| name max_speed |
| 0 falcon 389.0 |
| 1 parrot 24.0 |
| 2 lion 80.5 |
| 3 monkey NaN |
| |
| MultiIndex columns are also supported. |
| |
| >>> df = ps.DataFrame([('falcon', 'bird', 389.0), |
| ... ('parrot', 'bird', 24.0), |
| ... ('lion', 'mammal', 80.5), |
| ... ('monkey','mammal', np.nan)], |
| ... columns=('name', 'class', 'max_speed')) |
| >>> columns = [('a', 'name'), ('a', 'class'), ('b', 'max_speed')] |
| >>> df.columns = pd.MultiIndex.from_tuples(columns) |
| >>> df |
| a b |
| name class max_speed |
| 0 falcon bird 389.0 |
| 1 parrot bird 24.0 |
| 2 lion mammal 80.5 |
| 3 monkey mammal NaN |
| |
| >>> df.pop('a') |
| name class |
| 0 falcon bird |
| 1 parrot bird |
| 2 lion mammal |
| 3 monkey mammal |
| |
| >>> df |
| b |
| max_speed |
| 0 389.0 |
| 1 24.0 |
| 2 80.5 |
| 3 NaN |
| """ |
| result = self[item] |
| self._update_internal_frame(self.drop(item)._internal) |
| return result |
| |
| # TODO: add axis parameter can work when '1' or 'columns' |
| def xs(self, key: Name, axis: Axis = 0, level: Optional[int] = None) -> DataFrameOrSeries: |
| """ |
| Return cross-section from the DataFrame. |
| |
| This method takes a `key` argument to select data at a particular |
| level of a MultiIndex. |
| |
| Parameters |
| ---------- |
| key : label or tuple of label |
| Label contained in the index, or partially in a MultiIndex. |
| axis : 0 or 'index', default 0 |
| Axis to retrieve cross-section on; currently only 0 or 'index' is supported. |
| level : object, defaults to first n levels (n=1 or len(key)) |
| In case of a key partially contained in a MultiIndex, indicate |
| which levels are used. Levels can be referred by label or position. |
| |
| Returns |
| ------- |
| DataFrame or Series |
| Cross-section from the original DataFrame |
| corresponding to the selected index levels. |
| |
| See Also |
| -------- |
| DataFrame.loc : Access a group of rows and columns |
| by label(s) or a boolean array. |
| DataFrame.iloc : Purely integer-location based indexing |
| for selection by position. |
| |
| Examples |
| -------- |
| >>> d = {'num_legs': [4, 4, 2, 2], |
| ... 'num_wings': [0, 0, 2, 2], |
| ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], |
| ... 'animal': ['cat', 'dog', 'bat', 'penguin'], |
| ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} |
| >>> df = ps.DataFrame(data=d) |
| >>> df = df.set_index(['class', 'animal', 'locomotion']) |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_wings |
| class animal locomotion |
| mammal cat walks 4 0 |
| dog walks 4 0 |
| bat flies 2 2 |
| bird penguin walks 2 2 |
| |
| Get values at specified index |
| |
| >>> df.xs('mammal') # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_wings |
| animal locomotion |
| cat walks 4 0 |
| dog walks 4 0 |
| bat flies 2 2 |
| |
| Get values at several indexes |
| |
| >>> df.xs(('mammal', 'dog')) # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_wings |
| locomotion |
| walks 4 0 |
| |
| >>> df.xs(('mammal', 'dog', 'walks')) # doctest: +NORMALIZE_WHITESPACE |
| num_legs 4 |
| num_wings 0 |
| Name: (mammal, dog, walks), dtype: int64 |
| |
| Get values at specified index and level |
| |
| >>> df.xs('cat', level=1) # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_wings |
| class locomotion |
| mammal walks 4 0 |
| """ |
| from pyspark.pandas.series import first_series |
| |
| if not is_name_like_value(key): |
| raise TypeError("'key' should be a scalar value or tuple that contains scalar values") |
| |
| if level is not None and is_name_like_tuple(key): |
| raise KeyError(key) |
| |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError('axis should be either 0 or "index" currently.') |
| |
| if not is_name_like_tuple(key): |
| key = (key,) |
| if len(key) > self._internal.index_level: |
| raise KeyError( |
| "Key length ({}) exceeds index depth ({})".format( |
| len(key), self._internal.index_level |
| ) |
| ) |
| if level is None: |
| level = 0 |
| |
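| # Build one equality predicate per element of `key`, starting at `level`, |
| # and AND them together to filter the rows matching the cross-section. |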
| rows = [ |
| self._internal.index_spark_columns[lvl] == index for lvl, index in enumerate(key, level) |
| ] |
| internal = self._internal.with_filter(reduce(lambda x, y: x & y, rows)) |
| |
| if len(key) == self._internal.index_level: |
| psdf = DataFrame(internal) # type: DataFrame |
| pdf = psdf.head(2)._to_internal_pandas() |
| if len(pdf) == 0: |
| raise KeyError(key) |
| elif len(pdf) > 1: |
| return psdf |
| else: |
| return first_series(DataFrame(pdf.transpose())) |
| else: |
| index_spark_columns = ( |
| internal.index_spark_columns[:level] |
| + internal.index_spark_columns[level + len(key) :] |
| ) |
| index_names = internal.index_names[:level] + internal.index_names[level + len(key) :] |
| index_fields = internal.index_fields[:level] + internal.index_fields[level + len(key) :] |
| |
| internal = internal.copy( |
| index_spark_columns=index_spark_columns, |
| index_names=index_names, |
| index_fields=index_fields, |
| ).resolved_copy |
| return DataFrame(internal) |
| |
| def between_time( |
| self, |
| start_time: Union[datetime.time, str], |
| end_time: Union[datetime.time, str], |
| include_start: bool = True, |
| include_end: bool = True, |
| axis: Axis = 0, |
| ) -> "DataFrame": |
| """ |
| Select values between particular times of the day (example: 9:00-9:30 AM). |
| |
| By setting ``start_time`` to be later than ``end_time``, |
| you can get the times that are *not* between the two times. |
| |
| Parameters |
| ---------- |
| start_time : datetime.time or str |
| Initial time as a time filter limit. |
| end_time : datetime.time or str |
| End time as a time filter limit. |
| include_start : bool, default True |
| Whether the start time needs to be included in the result. |
| include_end : bool, default True |
| Whether the end time needs to be included in the result. |
| axis : {0 or 'index', 1 or 'columns'}, default 0 |
| Determine range time on index or columns value; currently only 0 or 'index' |
| is supported. |
| |
| Returns |
| ------- |
| DataFrame |
| Data from the original object filtered to the specified time range. |
| |
| Raises |
| ------ |
| TypeError |
| If the index is not a :class:`DatetimeIndex` |
| |
| See Also |
| -------- |
| at_time : Select values at a particular time of the day. |
| first : Select initial periods of time series based on a date offset. |
| last : Select final periods of time series based on a date offset. |
| DatetimeIndex.indexer_between_time : Get just the index locations for |
| values between particular times of the day. |
| |
| Examples |
| -------- |
| >>> idx = pd.date_range('2018-04-09', periods=4, freq='1D20min') |
| >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=idx) |
| >>> psdf |
| A |
| 2018-04-09 00:00:00 1 |
| 2018-04-10 00:20:00 2 |
| 2018-04-11 00:40:00 3 |
| 2018-04-12 01:00:00 4 |
| |
| >>> psdf.between_time('0:15', '0:45') |
| A |
| 2018-04-10 00:20:00 2 |
| 2018-04-11 00:40:00 3 |
| |
| You get the times that are *not* between two times by setting |
| ``start_time`` later than ``end_time``: |
| |
| >>> psdf.between_time('0:45', '0:15') |
| A |
| 2018-04-09 00:00:00 1 |
| 2018-04-12 01:00:00 4 |
| """ |
| axis = validate_axis(axis) |
| |
| if axis != 0: |
| raise NotImplementedError("between_time currently only works for axis=0") |
| |
| if not isinstance(self.index, ps.DatetimeIndex): |
| raise TypeError("Index must be DatetimeIndex") |
| |
| psdf = self.copy() |
| psdf.index.name = verify_temp_column_name(psdf, "__index_name__") |
| return_types = [psdf.index.dtype] + list(psdf.dtypes) |
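| # The index is turned into a regular column by reset_index() inside the UDF |
| # below, so the return type hint lists the index dtype first, followed by |
| # the data dtypes. |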
| |
| def pandas_between_time(pdf) -> ps.DataFrame[return_types]: # type: ignore |
| return pdf.between_time(start_time, end_time, include_start, include_end).reset_index() |
| |
| # apply_batch will remove the index of the pandas-on-Spark DataFrame and attach a |
| # default index, which will never be used. So use "distributed" index as a dummy to |
| # avoid overhead. |
| with option_context("compute.default_index_type", "distributed"): |
| psdf = psdf.pandas_on_spark.apply_batch(pandas_between_time) |
| |
| return DataFrame( |
| self._internal.copy( |
| spark_frame=psdf._internal.spark_frame, |
| index_spark_columns=psdf._internal.data_spark_columns[:1], |
| index_fields=psdf._internal.data_fields[:1], |
| data_spark_columns=psdf._internal.data_spark_columns[1:], |
| data_fields=psdf._internal.data_fields[1:], |
| ) |
| ) |
| |
| # TODO: implement axis=1 |
| def at_time( |
| self, time: Union[datetime.time, str], asof: bool = False, axis: Axis = 0 |
| ) -> "DataFrame": |
| """ |
| Select values at particular time of day (example: 9:30AM). |
| |
| Parameters |
| ---------- |
| time : datetime.time or str |
| axis : {0 or 'index', 1 or 'columns'}, default 0 |
| |
| Returns |
| ------- |
| DataFrame |
| |
| Raises |
| ------ |
| TypeError |
| If the index is not a :class:`DatetimeIndex` |
| |
| See Also |
| -------- |
| between_time : Select values between particular times of the day. |
| DatetimeIndex.indexer_at_time : Get just the index locations for |
| values at particular time of the day. |
| |
| Examples |
| -------- |
| >>> idx = pd.date_range('2018-04-09', periods=4, freq='12H') |
| >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=idx) |
| >>> psdf |
| A |
| 2018-04-09 00:00:00 1 |
| 2018-04-09 12:00:00 2 |
| 2018-04-10 00:00:00 3 |
| 2018-04-10 12:00:00 4 |
| |
| >>> psdf.at_time('12:00') |
| A |
| 2018-04-09 12:00:00 2 |
| 2018-04-10 12:00:00 4 |
| """ |
| if asof: |
| raise NotImplementedError("'asof' argument is not supported") |
| |
| axis = validate_axis(axis) |
| |
| if axis != 0: |
| raise NotImplementedError("at_time currently only works for axis=0") |
| |
| if not isinstance(self.index, ps.DatetimeIndex): |
| raise TypeError("Index must be DatetimeIndex") |
| |
| psdf = self.copy() |
| psdf.index.name = verify_temp_column_name(psdf, "__index_name__") |
| return_types = [psdf.index.dtype] + list(psdf.dtypes) |
| |
| if LooseVersion(pd.__version__) < LooseVersion("0.24"): |
| |
| def pandas_at_time(pdf) -> ps.DataFrame[return_types]: # type: ignore |
| return pdf.at_time(time, asof).reset_index() |
| |
| else: |
| |
| def pandas_at_time(pdf) -> ps.DataFrame[return_types]: # type: ignore |
| return pdf.at_time(time, asof, axis).reset_index() |
| |
| # apply_batch will remove the index of the pandas-on-Spark DataFrame and attach |
| # a default index, which will never be used. So use "distributed" index as a dummy |
| # to avoid overhead. |
| with option_context("compute.default_index_type", "distributed"): |
| psdf = psdf.pandas_on_spark.apply_batch(pandas_at_time) |
| |
| return DataFrame( |
| self._internal.copy( |
| spark_frame=psdf._internal.spark_frame, |
| index_spark_columns=psdf._internal.data_spark_columns[:1], |
| index_fields=psdf._internal.data_fields[:1], |
| data_spark_columns=psdf._internal.data_spark_columns[1:], |
| data_fields=psdf._internal.data_fields[1:], |
| ) |
| ) |
| |
| def where( |
| self, cond: DataFrameOrSeries, other: Union[DataFrameOrSeries, Any] = np.nan |
| ) -> "DataFrame": |
| """ |
| Replace values where the condition is False. |
| |
| Parameters |
| ---------- |
| cond : boolean DataFrame or Series |
| Where cond is True, keep the original value. Where False, |
| replace with corresponding value from other. |
| other : scalar, DataFrame or Series |
| Entries where cond is False are replaced with corresponding value from other. |
| |
| Returns |
| ------- |
| DataFrame |
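| |
| See Also |
| -------- |
| DataFrame.mask : Replace values where the condition is True. |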
| |
| Examples |
| -------- |
| |
| >>> from pyspark.pandas.config import set_option, reset_option |
| >>> set_option("compute.ops_on_diff_frames", True) |
| >>> df1 = ps.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]}) |
| >>> df2 = ps.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]}) |
| >>> df1 |
| A B |
| 0 0 100 |
| 1 1 200 |
| 2 2 300 |
| 3 3 400 |
| 4 4 500 |
| >>> df2 |
| A B |
| 0 0 -100 |
| 1 -1 -200 |
| 2 -2 -300 |
| 3 -3 -400 |
| 4 -4 -500 |
| |
| >>> df1.where(df1 > 0).sort_index() |
| A B |
| 0 NaN 100.0 |
| 1 1.0 200.0 |
| 2 2.0 300.0 |
| 3 3.0 400.0 |
| 4 4.0 500.0 |
| |
| >>> df1.where(df1 > 1, 10).sort_index() |
| A B |
| 0 10 100 |
| 1 10 200 |
| 2 2 300 |
| 3 3 400 |
| 4 4 500 |
| |
| >>> df1.where(df1 > 1, df1 + 100).sort_index() |
| A B |
| 0 100 100 |
| 1 101 200 |
| 2 2 300 |
| 3 3 400 |
| 4 4 500 |
| |
| >>> df1.where(df1 > 1, df2).sort_index() |
| A B |
| 0 0 100 |
| 1 -1 200 |
| 2 2 300 |
| 3 3 400 |
| 4 4 500 |
| |
| When the column names of cond differ from those of self, all values are treated as False: |
| |
| >>> cond = ps.DataFrame({'C': [0, -1, -2, -3, -4], 'D':[4, 3, 2, 1, 0]}) % 3 == 0 |
| >>> cond |
| C D |
| 0 True False |
| 1 False True |
| 2 False False |
| 3 True False |
| 4 False True |
| |
| >>> df1.where(cond).sort_index() |
| A B |
| 0 NaN NaN |
| 1 NaN NaN |
| 2 NaN NaN |
| 3 NaN NaN |
| 4 NaN NaN |
| |
| When cond is a Series, only its boolean values are checked, regardless of column names: |
| |
| >>> cond = ps.Series([1, 2]) > 1 |
| >>> cond |
| 0 False |
| 1 True |
| dtype: bool |
| |
| >>> df1.where(cond).sort_index() |
| A B |
| 0 NaN NaN |
| 1 1.0 200.0 |
| 2 NaN NaN |
| 3 NaN NaN |
| 4 NaN NaN |
| |
| >>> reset_option("compute.ops_on_diff_frames") |
| """ |
| from pyspark.pandas.series import Series |
| |
| tmp_cond_col_name = "__tmp_cond_col_{}__".format |
| tmp_other_col_name = "__tmp_other_col_{}__".format |
| |
| psdf = self.copy() |
| |
| tmp_cond_col_names = [ |
| tmp_cond_col_name(name_like_string(label)) for label in self._internal.column_labels |
| ] |
| if isinstance(cond, DataFrame): |
| cond = cond[ |
| [ |
| ( |
| cond._internal.spark_column_for(label) |
| if label in cond._internal.column_labels |
| else SF.lit(False) |
| ).alias(name) |
| for label, name in zip(self._internal.column_labels, tmp_cond_col_names) |
| ] |
| ] |
| psdf[tmp_cond_col_names] = cond |
| elif isinstance(cond, Series): |
| cond = cond.to_frame() |
| cond = cond[ |
| [cond._internal.data_spark_columns[0].alias(name) for name in tmp_cond_col_names] |
| ] |
| psdf[tmp_cond_col_names] = cond |
| else: |
| raise TypeError("type of cond must be a DataFrame or Series") |
| |
| tmp_other_col_names = [ |
| tmp_other_col_name(name_like_string(label)) for label in self._internal.column_labels |
| ] |
| if isinstance(other, DataFrame): |
| other = other[ |
| [ |
| ( |
| other._internal.spark_column_for(label) |
| if label in other._internal.column_labels |
| else SF.lit(np.nan) |
| ).alias(name) |
| for label, name in zip(self._internal.column_labels, tmp_other_col_names) |
| ] |
| ] |
| psdf[tmp_other_col_names] = other |
| elif isinstance(other, Series): |
| other = other.to_frame() |
| other = other[ |
| [other._internal.data_spark_columns[0].alias(name) for name in tmp_other_col_names] |
| ] |
| psdf[tmp_other_col_names] = other |
| else: |
| for label in self._internal.column_labels: |
| psdf[tmp_other_col_name(name_like_string(label))] = other |
| |
| # The logic above makes the Spark DataFrame look like the following: |
| # +-----------------+---+---+------------------+-------------------+------------------+--... |
| # |__index_level_0__| A| B|__tmp_cond_col_A__|__tmp_other_col_A__|__tmp_cond_col_B__|__... |
| # +-----------------+---+---+------------------+-------------------+------------------+--... |
| # | 0| 0|100| true| 0| false| ... |
| # | 1| 1|200| false| -1| false| ... |
| # | 3| 3|400| true| -3| false| ... |
| # | 2| 2|300| false| -2| true| ... |
| # | 4| 4|500| false| -4| false| ... |
| # +-----------------+---+---+------------------+-------------------+------------------+--... |
| |
| data_spark_columns = [] |
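| # For each column, keep the original value where its condition column is |
| # true; otherwise substitute the value from its "other" column. |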
| for label in self._internal.column_labels: |
| data_spark_columns.append( |
| F.when( |
| psdf[tmp_cond_col_name(name_like_string(label))].spark.column, |
| psdf._internal.spark_column_for(label), |
| ) |
| .otherwise(psdf[tmp_other_col_name(name_like_string(label))].spark.column) |
| .alias(psdf._internal.spark_column_name_for(label)) |
| ) |
| |
| return DataFrame( |
| psdf._internal.with_new_columns( |
| data_spark_columns, column_labels=self._internal.column_labels # TODO: dtypes? |
| ) |
| ) |
| |
| def mask( |
| self, cond: DataFrameOrSeries, other: Union[DataFrameOrSeries, Any] = np.nan |
| ) -> "DataFrame": |
| """ |
| Replace values where the condition is True. |
| |
| Parameters |
| ---------- |
| cond : boolean DataFrame or Series |
| Where cond is False, keep the original value. Where True, |
| replace with corresponding value from other. |
| other : scalar, DataFrame or Series |
| Entries where cond is True are replaced with corresponding value from other. |
| |
| Returns |
| ------- |
| DataFrame |
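| |
| See Also |
| -------- |
| DataFrame.where : Replace values where the condition is False. |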
| |
| Examples |
| -------- |
| |
| >>> from pyspark.pandas.config import set_option, reset_option |
| >>> set_option("compute.ops_on_diff_frames", True) |
| >>> df1 = ps.DataFrame({'A': [0, 1, 2, 3, 4], 'B':[100, 200, 300, 400, 500]}) |
| >>> df2 = ps.DataFrame({'A': [0, -1, -2, -3, -4], 'B':[-100, -200, -300, -400, -500]}) |
| >>> df1 |
| A B |
| 0 0 100 |
| 1 1 200 |
| 2 2 300 |
| 3 3 400 |
| 4 4 500 |
| >>> df2 |
| A B |
| 0 0 -100 |
| 1 -1 -200 |
| 2 -2 -300 |
| 3 -3 -400 |
| 4 -4 -500 |
| |
| >>> df1.mask(df1 > 0).sort_index() |
| A B |
| 0 0.0 NaN |
| 1 NaN NaN |
| 2 NaN NaN |
| 3 NaN NaN |
| 4 NaN NaN |
| |
| >>> df1.mask(df1 > 1, 10).sort_index() |
| A B |
| 0 0 10 |
| 1 1 10 |
| 2 10 10 |
| 3 10 10 |
| 4 10 10 |
| |
| >>> df1.mask(df1 > 1, df1 + 100).sort_index() |
| A B |
| 0 0 200 |
| 1 1 300 |
| 2 102 400 |
| 3 103 500 |
| 4 104 600 |
| |
| >>> df1.mask(df1 > 1, df2).sort_index() |
| A B |
| 0 0 -100 |
| 1 1 -200 |
| 2 -2 -300 |
| 3 -3 -400 |
| 4 -4 -500 |
| |
| >>> reset_option("compute.ops_on_diff_frames") |
| """ |
| from pyspark.pandas.series import Series |
| |
| if not isinstance(cond, (DataFrame, Series)): |
| raise TypeError("type of cond must be a DataFrame or Series") |
| |
| cond_inversed = cond._apply_series_op(lambda psser: ~psser) |
| return self.where(cond_inversed, other) |
| |
| @property |
| def index(self) -> "Index": |
| """The index (row labels) Column of the DataFrame. |
| |
| Currently not supported when the DataFrame has no index. |
| |
| See Also |
| -------- |
| Index |
| """ |
| from pyspark.pandas.indexes.base import Index |
| |
| return Index._new_instance(self) |
| |
| @property |
| def empty(self) -> bool: |
| """ |
| Returns true if the current DataFrame is empty. Otherwise, returns false. |
| |
| Examples |
| -------- |
| >>> ps.range(10).empty |
| False |
| |
| >>> ps.range(0).empty |
| True |
| |
| >>> ps.DataFrame({}, index=list('abc')).empty |
| True |
| """ |
| return ( |
| len(self._internal.column_labels) == 0 |
| or self._internal.resolved_copy.spark_frame.rdd.isEmpty() |
| ) |
| |
| @property |
| def style(self) -> "Styler": |
| """ |
| Property returning a Styler object containing methods for |
| building a styled HTML representation for the DataFrame. |
| |
| .. note:: currently it collects the top `compute.max_rows` rows (1000 by |
| default) and returns its pandas `pandas.io.formats.style.Styler` instance. |
| |
| Examples |
| -------- |
| >>> ps.range(1001).style # doctest: +SKIP |
| <pandas.io.formats.style.Styler object at ...> |
| """ |
| max_results = get_option("compute.max_rows") |
| pdf = self.head(max_results + 1)._to_internal_pandas() |
| if len(pdf) > max_results: |
| warnings.warn("'style' property will only use top %s rows." % max_results, UserWarning) |
| return pdf.head(max_results).style |
| |
| def set_index( |
| self, |
| keys: Union[Name, List[Name]], |
| drop: bool = True, |
| append: bool = False, |
| inplace: bool = False, |
| ) -> Optional["DataFrame"]: |
| """Set the DataFrame index (row labels) using one or more existing columns. |
| |
| Set the DataFrame index (row labels) using one or more existing |
| columns or arrays (of the correct length). The index can replace the |
| existing index or expand on it. |
| |
| Parameters |
| ---------- |
| keys : label or array-like or list of labels/arrays |
| This parameter can be either a single column key, a single array of |
| the same length as the calling DataFrame, or a list containing an |
| arbitrary combination of column keys and arrays. Here, "array" |
| encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. |
| drop : bool, default True |
| Delete columns to be used as the new index. |
| append : bool, default False |
| Whether to append columns to existing index. |
| inplace : bool, default False |
| Modify the DataFrame in place (do not create a new object). |
| |
| Returns |
| ------- |
| DataFrame |
| Changed row labels. |
| |
| See Also |
| -------- |
| DataFrame.reset_index : Opposite of set_index. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'month': [1, 4, 7, 10], |
| ... 'year': [2012, 2014, 2013, 2014], |
| ... 'sale': [55, 40, 84, 31]}, |
| ... columns=['month', 'year', 'sale']) |
| >>> df |
| month year sale |
| 0 1 2012 55 |
| 1 4 2014 40 |
| 2 7 2013 84 |
| 3 10 2014 31 |
| |
| Set the index to become the 'month' column: |
| |
| >>> df.set_index('month') # doctest: +NORMALIZE_WHITESPACE |
| year sale |
| month |
| 1 2012 55 |
| 4 2014 40 |
| 7 2013 84 |
| 10 2014 31 |
| |
| Create a MultiIndex using columns 'year' and 'month': |
| |
| >>> df.set_index(['year', 'month']) # doctest: +NORMALIZE_WHITESPACE |
| sale |
| year month |
| 2012 1 55 |
| 2014 4 40 |
| 2013 7 84 |
| 2014 10 31 |
| """ |
| inplace = validate_bool_kwarg(inplace, "inplace") |
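| # Normalize `keys` into a list of tuple labels, since column labels are |
| # represented as tuples internally. |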
| if is_name_like_tuple(keys): |
| key_list = [cast(Label, keys)] # type: List[Label] |
| elif is_name_like_value(keys): |
| key_list = [(keys,)] |
| else: |
| key_list = [key if is_name_like_tuple(key) else (key,) for key in keys] |
| columns = set(self._internal.column_labels) |
| for key in key_list: |
| if key not in columns: |
| raise KeyError(name_like_string(key)) |
| |
| if drop: |
| column_labels = [ |
| label for label in self._internal.column_labels if label not in key_list |
| ] |
| else: |
| column_labels = self._internal.column_labels |
| if append: |
| index_spark_columns = self._internal.index_spark_columns + [ |
| self._internal.spark_column_for(label) for label in key_list |
| ] |
| index_names = self._internal.index_names + key_list |
| index_fields = self._internal.index_fields + [ |
| self._internal.field_for(label) for label in key_list |
| ] |
| else: |
| index_spark_columns = [self._internal.spark_column_for(label) for label in key_list] |
| index_names = key_list |
| index_fields = [self._internal.field_for(label) for label in key_list] |
| |
| internal = self._internal.copy( |
| index_spark_columns=index_spark_columns, |
| index_names=index_names, |
| index_fields=index_fields, |
| column_labels=column_labels, |
| data_spark_columns=[self._internal.spark_column_for(label) for label in column_labels], |
| data_fields=[self._internal.field_for(label) for label in column_labels], |
| ) |
| |
| if inplace: |
| self._update_internal_frame(internal) |
| return None |
| else: |
| return DataFrame(internal) |
| |
| def reset_index( |
| self, |
| level: Optional[Union[int, Name, Sequence[Union[int, Name]]]] = None, |
| drop: bool = False, |
| inplace: bool = False, |
| col_level: int = 0, |
| col_fill: str = "", |
| ) -> Optional["DataFrame"]: |
| """Reset the index, or a level of it. |
| |
| For DataFrame with multi-level index, return new DataFrame with labeling information in |
| the columns under the index names, defaulting to 'level_0', 'level_1', etc. if any are None. |
| For a standard index, the index name will be used (if set), otherwise a default 'index' or |
| 'level_0' (if 'index' is already taken) will be used. |
| |
| Parameters |
| ---------- |
| level : int, str, tuple, or list, default None |
| Only remove the given levels from the index. Removes all levels by |
| default. |
| drop : bool, default False |
| Do not try to insert index into dataframe columns. This resets |
| the index to the default integer index. |
| inplace : bool, default False |
| Modify the DataFrame in place (do not create a new object). |
| col_level : int or str, default 0 |
| If the columns have multiple levels, determines which level the |
| labels are inserted into. By default it is inserted into the first |
| level. |
| col_fill : object, default '' |
| If the columns have multiple levels, determines how the other |
| levels are named. If None then the index name is repeated. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame with the new index. |
| |
| See Also |
| -------- |
| DataFrame.set_index : Opposite of reset_index. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([('bird', 389.0), |
| ... ('bird', 24.0), |
| ... ('mammal', 80.5), |
| ... ('mammal', np.nan)], |
| ... index=['falcon', 'parrot', 'lion', 'monkey'], |
| ... columns=('class', 'max_speed')) |
| >>> df |
| class max_speed |
| falcon bird 389.0 |
| parrot bird 24.0 |
| lion mammal 80.5 |
| monkey mammal NaN |
| |
| When we reset the index, the old index is added as a column. Unlike pandas, pandas-on-Spark |
| does not automatically add a sequential index. The following 0, 1, 2, 3 are only |
| there when we display the DataFrame. |
| |
| >>> df.reset_index() |
| index class max_speed |
| 0 falcon bird 389.0 |
| 1 parrot bird 24.0 |
| 2 lion mammal 80.5 |
| 3 monkey mammal NaN |
| |
| We can use the `drop` parameter to avoid the old index being added as |
| a column: |
| |
| >>> df.reset_index(drop=True) |
| class max_speed |
| 0 bird 389.0 |
| 1 bird 24.0 |
| 2 mammal 80.5 |
| 3 mammal NaN |
| |
| You can also use `reset_index` with `MultiIndex`. |
| |
| >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), |
| ... ('bird', 'parrot'), |
| ... ('mammal', 'lion'), |
| ... ('mammal', 'monkey')], |
| ... names=['class', 'name']) |
| >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), |
| ... ('species', 'type')]) |
| >>> df = ps.DataFrame([(389.0, 'fly'), |
| ... ( 24.0, 'fly'), |
| ... ( 80.5, 'run'), |
| ... (np.nan, 'jump')], |
| ... index=index, |
| ... columns=columns) |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| speed species |
| max type |
| class name |
| bird falcon 389.0 fly |
| parrot 24.0 fly |
| mammal lion 80.5 run |
| monkey NaN jump |
| |
| If the index has multiple levels, we can reset a subset of them: |
| |
| >>> df.reset_index(level='class') # doctest: +NORMALIZE_WHITESPACE |
| class speed species |
| max type |
| name |
| falcon bird 389.0 fly |
| parrot bird 24.0 fly |
| lion mammal 80.5 run |
| monkey mammal NaN jump |
| |
| If we are not dropping the index, by default, it is placed in the top |
| level. We can place it in another level: |
| |
| >>> df.reset_index(level='class', col_level=1) # doctest: +NORMALIZE_WHITESPACE |
| speed species |
| class max type |
| name |
| falcon bird 389.0 fly |
| parrot bird 24.0 fly |
| lion mammal 80.5 run |
| monkey mammal NaN jump |
| |
| When the index is inserted under another level, we can specify under |
| which one with the parameter `col_fill`: |
| |
| >>> df.reset_index(level='class', col_level=1, |
| ... col_fill='species') # doctest: +NORMALIZE_WHITESPACE |
| species speed species |
| class max type |
| name |
| falcon bird 389.0 fly |
| parrot bird 24.0 fly |
| lion mammal 80.5 run |
| monkey mammal NaN jump |
| |
| If we specify a nonexistent level for `col_fill`, it is created: |
| |
| >>> df.reset_index(level='class', col_level=1, |
| ... col_fill='genus') # doctest: +NORMALIZE_WHITESPACE |
| genus speed species |
| class max type |
| name |
| falcon bird 389.0 fly |
| parrot bird 24.0 fly |
| lion mammal 80.5 run |
| monkey mammal NaN jump |
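
        The reset can also be done in place, in which case ``None`` is returned
        (a small sketch; skipped in doctests):

        >>> df.reset_index(level='class', inplace=True)  # doctest: +SKIP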
| """ |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| multi_index = self._internal.index_level > 1 |
| |
| def rename(index: int) -> Label: |
| if multi_index: |
| return ("level_{}".format(index),) |
| else: |
| if ("index",) not in self._internal.column_labels: |
| return ("index",) |
| else: |
| return ("level_{}".format(index),) |
| |
| if level is None: |
| new_column_labels = [ |
| name if name is not None else rename(i) |
| for i, name in enumerate(self._internal.index_names) |
| ] |
| new_data_spark_columns = [ |
| scol.alias(name_like_string(label)) |
| for scol, label in zip(self._internal.index_spark_columns, new_column_labels) |
| ] |
| new_data_fields = self._internal.index_fields |
| |
| index_spark_columns = [] |
| index_names = [] |
| index_fields = [] |
| else: |
| if is_list_like(level): |
| level = list(cast(Sequence[Union[int, Name]], level)) |
| if isinstance(level, int) or is_name_like_tuple(level): |
| level_list = [cast(Union[int, Label], level)] |
| elif is_name_like_value(level): |
| level_list = [(level,)] |
| else: |
| level_list = [ |
| lvl if isinstance(lvl, int) or is_name_like_tuple(lvl) else (lvl,) |
| for lvl in level |
| ] |
| |
| if all(isinstance(l, int) for l in level_list): |
| int_level_list = cast(List[int], level_list) |
| for lev in int_level_list: |
| if lev >= self._internal.index_level: |
| raise IndexError( |
| "Too many levels: Index has only {} level, not {}".format( |
| self._internal.index_level, lev + 1 |
| ) |
| ) |
| idx = int_level_list |
| elif all(is_name_like_tuple(lev) for lev in level_list): |
| idx = [] |
| for l in cast(List[Label], level_list): |
| try: |
| i = self._internal.index_names.index(l) |
| idx.append(i) |
| except ValueError: |
| if multi_index: |
| raise KeyError("Level unknown not found") |
| else: |
| raise KeyError( |
| "Level unknown must be same as name ({})".format( |
| name_like_string(self._internal.index_names[0]) |
| ) |
| ) |
| else: |
| raise ValueError("Level should be all int or all string.") |
| idx.sort() |
| |
| new_column_labels = [] |
| new_data_spark_columns = [] |
| new_data_fields = [] |
| |
| index_spark_columns = self._internal.index_spark_columns.copy() |
| index_names = self._internal.index_names.copy() |
| index_fields = self._internal.index_fields.copy() |
| |
| for i in idx[::-1]: |
| name = index_names.pop(i) |
| new_column_labels.insert(0, name if name is not None else rename(i)) |
| |
| scol = index_spark_columns.pop(i) |
| new_data_spark_columns.insert(0, scol.alias(name_like_string(name))) |
| |
| new_data_fields.insert(0, index_fields.pop(i).copy(name=name_like_string(name))) |
| |
| if drop: |
| new_data_spark_columns = [] |
| new_column_labels = [] |
| new_data_fields = [] |
| |
| for label in new_column_labels: |
| if label in self._internal.column_labels: |
| raise ValueError("cannot insert {}, already exists".format(name_like_string(label))) |
| |
| if self._internal.column_labels_level > 1: |
| column_depth = len(self._internal.column_labels[0]) |
| if col_level >= column_depth: |
| raise IndexError( |
| "Too many levels: Index has only {} levels, not {}".format( |
| column_depth, col_level + 1 |
| ) |
| ) |
| if any(col_level + len(label) > column_depth for label in new_column_labels): |
| raise ValueError("Item must have length equal to number of levels.") |
| new_column_labels = [ |
| tuple( |
| ([col_fill] * col_level) |
| + list(label) |
| + ([col_fill] * (column_depth - (len(label) + col_level))) |
| ) |
| for label in new_column_labels |
| ] |
| |
| internal = self._internal.copy( |
| index_spark_columns=index_spark_columns, |
| index_names=index_names, |
| index_fields=index_fields, |
| column_labels=new_column_labels + self._internal.column_labels, |
| data_spark_columns=new_data_spark_columns + self._internal.data_spark_columns, |
| data_fields=new_data_fields + self._internal.data_fields, |
| ) |
| |
| if inplace: |
| self._update_internal_frame(internal) |
| return None |
| else: |
| return DataFrame(internal) |
| |
| def isnull(self) -> "DataFrame": |
| """ |
        Detects missing values for items in the current DataFrame.

        Return a boolean same-sized DataFrame indicating if the values are NA.
        NA values, such as None or numpy.NaN, get mapped to True values.
        Everything else gets mapped to False values.
| |
| See Also |
| -------- |
| DataFrame.notnull |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)]) |
| >>> df.isnull() |
| 0 1 |
| 0 False False |
| 1 False True |
| 2 False True |
| 3 False False |
| |
| >>> df = ps.DataFrame([[None, 'bee', None], ['dog', None, 'fly']]) |
| >>> df.isnull() |
| 0 1 2 |
| 0 True False True |
| 1 False True False |
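
        A common follow-up, counting the missing values per column, can be done
        with ``sum`` (a sketch of the expected output; skipped in doctests):

        >>> df.isnull().sum()  # doctest: +SKIP
        0    1
        1    1
        2    1
        dtype: int64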
| """ |
| return self._apply_series_op(lambda psser: psser.isnull()) |
| |
| isna = isnull |
| |
| def notnull(self) -> "DataFrame": |
| """ |
        Detects non-missing values for items in the current DataFrame.

        This function takes a DataFrame and indicates whether its
        values are valid (not missing, which is ``NaN`` in numeric
        datatypes, ``None`` or ``NaN`` in objects and ``NaT`` in datetimelike).
| |
| See Also |
| -------- |
| DataFrame.isnull |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([(.2, .3), (.0, None), (.6, None), (.2, .1)]) |
| >>> df.notnull() |
| 0 1 |
| 0 True True |
| 1 True False |
| 2 True False |
| 3 True True |
| |
| >>> df = ps.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']]) |
| >>> df.notnull() |
| 0 1 2 |
| 0 True True True |
| 1 True False True |
| """ |
| return self._apply_series_op(lambda psser: psser.notnull()) |
| |
| notna = notnull |
| |
| def insert( |
| self, |
| loc: int, |
| column: Name, |
| value: Union[Scalar, "Series", Iterable], |
| allow_duplicates: bool = False, |
| ) -> None: |
| """ |
| Insert column into DataFrame at specified location. |
| |
| Raises a ValueError if `column` is already contained in the DataFrame, |
| unless `allow_duplicates` is set to True. |
| |
| Parameters |
| ---------- |
| loc : int |
| Insertion index. Must verify 0 <= loc <= len(columns). |
| column : str, number, or hashable object |
| Label of the inserted column. |
        value : scalar, Series, or array-like
            Value(s) to insert into the DataFrame.
        allow_duplicates : bool, default False
            Currently only False is supported.
| |
| Examples |
| -------- |
| >>> psdf = ps.DataFrame([1, 2, 3]) |
| >>> psdf.sort_index() |
| 0 |
| 0 1 |
| 1 2 |
| 2 3 |
| >>> psdf.insert(0, 'x', 4) |
| >>> psdf.sort_index() |
| x 0 |
| 0 4 1 |
| 1 4 2 |
| 2 4 3 |
| |
| >>> from pyspark.pandas.config import set_option, reset_option |
| >>> set_option("compute.ops_on_diff_frames", True) |
| |
| >>> psdf.insert(1, 'y', [5, 6, 7]) |
| >>> psdf.sort_index() |
| x y 0 |
| 0 4 5 1 |
| 1 4 6 2 |
| 2 4 7 3 |
| |
| >>> psdf.insert(2, 'z', ps.Series([8, 9, 10])) |
| >>> psdf.sort_index() |
| x y z 0 |
| 0 4 5 8 1 |
| 1 4 6 9 2 |
| 2 4 7 10 3 |
| |
| >>> reset_option("compute.ops_on_diff_frames") |
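
        Inserting a column label that already exists raises a ``ValueError``
        (a sketch; skipped in doctests):

        >>> psdf.insert(0, 'x', 100)  # doctest: +SKIP
        Traceback (most recent call last):
        ...
        ValueError: cannot insert x, already exists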
| """ |
| if not isinstance(loc, int): |
| raise TypeError("loc must be int") |
| |
| assert 0 <= loc <= len(self.columns) |
| assert allow_duplicates is False |
| |
| if not is_name_like_value(column): |
| raise TypeError( |
| '"column" should be a scalar value or tuple that contains scalar values' |
| ) |
| |
| if is_name_like_tuple(column): |
| if len(column) != len(self.columns.levels): |
| # To be consistent with pandas |
| raise ValueError('"column" must have length equal to number of column levels.') |
| |
| if column in self.columns: |
| raise ValueError("cannot insert %s, already exists" % column) |
| |
| psdf = self.copy() |
| psdf[column] = value |
| columns = psdf.columns[:-1].insert(loc, psdf.columns[-1]) |
| psdf = psdf[columns] |
| self._update_internal_frame(psdf._internal) |
| |
    # TODO: add freq and axis parameters
| def shift(self, periods: int = 1, fill_value: Optional[Any] = None) -> "DataFrame": |
| """ |
| Shift DataFrame by desired number of periods. |
| |
        .. note:: the current implementation of shift uses Spark's Window without
            specifying a partition specification. This leads to moving all data into a
            single partition in a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.
| |
| Parameters |
| ---------- |
| periods : int |
| Number of periods to shift. Can be positive or negative. |
| fill_value : object, optional |
| The scalar value to use for newly introduced missing values. |
| The default depends on the dtype of self. For numeric data, np.nan is used. |
| |
| Returns |
| ------- |
| Copy of input DataFrame, shifted. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'Col1': [10, 20, 15, 30, 45], |
| ... 'Col2': [13, 23, 18, 33, 48], |
| ... 'Col3': [17, 27, 22, 37, 52]}, |
| ... columns=['Col1', 'Col2', 'Col3']) |
| |
| >>> df.shift(periods=3) |
| Col1 Col2 Col3 |
| 0 NaN NaN NaN |
| 1 NaN NaN NaN |
| 2 NaN NaN NaN |
| 3 10.0 13.0 17.0 |
| 4 20.0 23.0 27.0 |
| |
| >>> df.shift(periods=3, fill_value=0) |
| Col1 Col2 Col3 |
| 0 0 0 0 |
| 1 0 0 0 |
| 2 0 0 0 |
| 3 10 13 17 |
| 4 20 23 27 |
| |
| """ |
| return self._apply_series_op( |
| lambda psser: psser._shift(periods, fill_value), should_resolve=True |
| ) |
| |
    # TODO: axis should also support 1 or 'columns' (it currently supports only 0 or 'index')
| def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": |
| """ |
| First discrete difference of element. |
| |
| Calculates the difference of a DataFrame element compared with another element in the |
| DataFrame (default is the element in the same column of the previous row). |
| |
        .. note:: the current implementation of diff uses Spark's Window without
            specifying a partition specification. This leads to moving all data into a
            single partition in a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.
| |
| Parameters |
| ---------- |
| periods : int, default 1 |
| Periods to shift for calculating difference, accepts negative values. |
| axis : int, default 0 or 'index' |
| Can only be set to 0 at the moment. |
| |
| Returns |
| ------- |
| diffed : DataFrame |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'a': [1, 2, 3, 4, 5, 6], |
| ... 'b': [1, 1, 2, 3, 5, 8], |
| ... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c']) |
| >>> df |
| a b c |
| 0 1 1 1 |
| 1 2 1 4 |
| 2 3 2 9 |
| 3 4 3 16 |
| 4 5 5 25 |
| 5 6 8 36 |
| |
| >>> df.diff() |
| a b c |
| 0 NaN NaN NaN |
| 1 1.0 0.0 3.0 |
| 2 1.0 1.0 5.0 |
| 3 1.0 1.0 7.0 |
| 4 1.0 2.0 9.0 |
| 5 1.0 3.0 11.0 |
| |
        Difference with the 3rd previous row
| |
| >>> df.diff(periods=3) |
| a b c |
| 0 NaN NaN NaN |
| 1 NaN NaN NaN |
| 2 NaN NaN NaN |
| 3 3.0 2.0 15.0 |
| 4 3.0 4.0 21.0 |
| 5 3.0 6.0 27.0 |
| |
| Difference with following row |
| |
| >>> df.diff(periods=-1) |
| a b c |
| 0 -1.0 0.0 -3.0 |
| 1 -1.0 -1.0 -5.0 |
| 2 -1.0 -1.0 -7.0 |
| 3 -1.0 -2.0 -9.0 |
| 4 -1.0 -3.0 -11.0 |
| 5 NaN NaN NaN |
| """ |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError('axis should be either 0 or "index" currently.') |
| |
| return self._apply_series_op(lambda psser: psser._diff(periods), should_resolve=True) |
| |
    # TODO: axis should also support 1 or 'columns' (it currently supports only 0 or 'index')
| def nunique( |
| self, |
| axis: Axis = 0, |
| dropna: bool = True, |
| approx: bool = False, |
| rsd: float = 0.05, |
| ) -> "Series": |
| """ |
| Return number of unique elements in the object. |
| |
| Excludes NA values by default. |
| |
| Parameters |
| ---------- |
| axis : int, default 0 or 'index' |
| Can only be set to 0 at the moment. |
| dropna : bool, default True |
| Don’t include NaN in the count. |
        approx : bool, default False
            If False, will use the exact algorithm and return the exact number of unique values.
            If True, it uses the HyperLogLog approximate algorithm, which is significantly faster
            for large amounts of data.
            Note: This parameter is specific to pandas-on-Spark and is not found in pandas.
        rsd : float, default 0.05
            Maximum estimation error allowed in the HyperLogLog algorithm.
            Note: Just like ``approx`` this parameter is specific to pandas-on-Spark.
| |
| Returns |
| ------- |
| The number of unique values per column as a pandas-on-Spark Series. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [np.nan, 3, np.nan]}) |
| >>> df.nunique() |
| A 3 |
| B 1 |
| dtype: int64 |
| |
| >>> df.nunique(dropna=False) |
| A 3 |
| B 2 |
| dtype: int64 |
| |
| On big data, we recommend using the approximate algorithm to speed up this function. |
| The result will be very close to the exact unique count. |
| |
| >>> df.nunique(approx=True) |
| A 3 |
| B 1 |
| dtype: int64 |
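
        The estimation error of the approximate algorithm can be tightened with
        `rsd`, trading speed for accuracy (a sketch; skipped in doctests):

        >>> df.nunique(approx=True, rsd=0.01)  # doctest: +SKIP
        A    3
        B    1
        dtype: int64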
| """ |
| from pyspark.pandas.series import first_series |
| |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError('axis should be either 0 or "index" currently.') |
| sdf = self._internal.spark_frame.select( |
| [SF.lit(None).cast(StringType()).alias(SPARK_DEFAULT_INDEX_NAME)] |
| + [ |
| self._psser_for(label)._nunique(dropna, approx, rsd) |
| for label in self._internal.column_labels |
| ] |
| ) |
| |
| # The data is expected to be small so it's fine to transpose/use default index. |
| with ps.option_context("compute.max_rows", 1): |
| internal = self._internal.copy( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)], |
| index_names=[None], |
| index_fields=[None], |
| data_spark_columns=[ |
| scol_for(sdf, col) for col in self._internal.data_spark_column_names |
| ], |
| data_fields=None, |
| ) |
| return first_series(DataFrame(internal).transpose()) |
| |
| def round(self, decimals: Union[int, Dict[Name, int], "Series"] = 0) -> "DataFrame": |
| """ |
| Round a DataFrame to a variable number of decimal places. |
| |
| Parameters |
| ---------- |
| decimals : int, dict, Series |
| Number of decimal places to round each column to. If an int is |
| given, round each column to the same number of places. |
| Otherwise dict and Series round to variable numbers of places. |
| Column names should be in the keys if `decimals` is a |
| dict-like, or in the index if `decimals` is a Series. Any |
| columns not included in `decimals` will be left as is. Elements |
| of `decimals` which are not columns of the input will be |
| ignored. |
| |
| .. note:: If `decimals` is a Series, it is expected to be small, |
| as all the data is loaded into the driver's memory. |
| |
| Returns |
| ------- |
| DataFrame |
| |
| See Also |
| -------- |
| Series.round |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A':[0.028208, 0.038683, 0.877076], |
| ... 'B':[0.992815, 0.645646, 0.149370], |
| ... 'C':[0.173891, 0.577595, 0.491027]}, |
| ... columns=['A', 'B', 'C'], |
| ... index=['first', 'second', 'third']) |
| >>> df |
| A B C |
| first 0.028208 0.992815 0.173891 |
| second 0.038683 0.645646 0.577595 |
| third 0.877076 0.149370 0.491027 |
| |
| >>> df.round(2) |
| A B C |
| first 0.03 0.99 0.17 |
| second 0.04 0.65 0.58 |
| third 0.88 0.15 0.49 |
| |
| >>> df.round({'A': 1, 'C': 2}) |
| A B C |
| first 0.0 0.992815 0.17 |
| second 0.0 0.645646 0.58 |
| third 0.9 0.149370 0.49 |
| |
| >>> decimals = ps.Series([1, 0, 2], index=['A', 'B', 'C']) |
| >>> df.round(decimals) |
| A B C |
| first 0.0 1.0 0.17 |
| second 0.0 1.0 0.58 |
| third 0.9 0.0 0.49 |
| """ |
| if isinstance(decimals, ps.Series): |
| decimals_dict = { |
| k if isinstance(k, tuple) else (k,): v |
| for k, v in decimals._to_internal_pandas().items() |
| } |
| elif isinstance(decimals, dict): |
| decimals_dict = {k if is_name_like_tuple(k) else (k,): v for k, v in decimals.items()} |
| elif isinstance(decimals, int): |
| decimals_dict = {k: decimals for k in self._internal.column_labels} |
| else: |
| raise TypeError("decimals must be an integer, a dict-like or a Series") |
| |
| def op(psser: ps.Series) -> Union[ps.Series, Column]: |
| label = psser._column_label |
| if label in decimals_dict: |
| return F.round(psser.spark.column, decimals_dict[label]) |
| else: |
| return psser |
| |
| return self._apply_series_op(op) |
| |
| def _mark_duplicates( |
| self, |
| subset: Optional[Union[Name, List[Name]]] = None, |
| keep: str = "first", |
| ) -> Tuple[SparkDataFrame, str]: |
| if subset is None: |
| subset_list = self._internal.column_labels |
| else: |
| if is_name_like_tuple(subset): |
| subset_list = [cast(Label, subset)] |
| elif is_name_like_value(subset): |
| subset_list = [(subset,)] |
| else: |
| subset_list = [sub if is_name_like_tuple(sub) else (sub,) for sub in subset] |
| diff = set(subset_list).difference(set(self._internal.column_labels)) |
| if len(diff) > 0: |
| raise KeyError(", ".join([name_like_string(d) for d in diff])) |
| group_cols = [self._internal.spark_column_name_for(label) for label in subset_list] |
| |
| sdf = self._internal.resolved_copy.spark_frame |
| |
| column = verify_temp_column_name(sdf, "__duplicated__") |
| |
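        # A row is marked as duplicated unless it is the first (keep='first')
        # or last (keep='last') row, in the natural order, within its group of
        # identical subset values; keep=False marks every row whose group
        # contains more than one row.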
| if keep == "first" or keep == "last": |
| if keep == "first": |
| ord_func = F.asc |
| else: |
| ord_func = F.desc |
| window = ( |
| Window.partitionBy(*group_cols) |
| .orderBy(ord_func(NATURAL_ORDER_COLUMN_NAME)) |
| .rowsBetween(Window.unboundedPreceding, Window.currentRow) |
| ) |
| sdf = sdf.withColumn(column, F.row_number().over(window) > 1) |
| elif not keep: |
| window = Window.partitionBy(*group_cols).rowsBetween( |
| Window.unboundedPreceding, Window.unboundedFollowing |
| ) |
| sdf = sdf.withColumn(column, F.count("*").over(window) > 1) |
| else: |
| raise ValueError("'keep' only supports 'first', 'last' and False") |
| return sdf, column |
| |
| def duplicated( |
| self, |
| subset: Optional[Union[Name, List[Name]]] = None, |
| keep: str = "first", |
| ) -> "Series": |
| """ |
| Return boolean Series denoting duplicate rows, optionally only considering certain columns. |
| |
| Parameters |
| ---------- |
| subset : column label or sequence of labels, optional |
| Only consider certain columns for identifying duplicates, |
| by default use all of the columns |
| keep : {'first', 'last', False}, default 'first' |
| - ``first`` : Mark duplicates as ``True`` except for the first occurrence. |
| - ``last`` : Mark duplicates as ``True`` except for the last occurrence. |
| - False : Mark all duplicates as ``True``. |
| |
| Returns |
| ------- |
| duplicated : Series |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'a': [1, 1, 1, 3], 'b': [1, 1, 1, 4], 'c': [1, 1, 1, 5]}, |
| ... columns = ['a', 'b', 'c']) |
| >>> df |
| a b c |
| 0 1 1 1 |
| 1 1 1 1 |
| 2 1 1 1 |
| 3 3 4 5 |
| |
| >>> df.duplicated().sort_index() |
| 0 False |
| 1 True |
| 2 True |
| 3 False |
| dtype: bool |
| |
| Mark duplicates as ``True`` except for the last occurrence. |
| |
| >>> df.duplicated(keep='last').sort_index() |
| 0 True |
| 1 True |
| 2 False |
| 3 False |
| dtype: bool |
| |
| Mark all duplicates as ``True``. |
| |
| >>> df.duplicated(keep=False).sort_index() |
| 0 True |
| 1 True |
| 2 True |
| 3 False |
| dtype: bool |
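
        Only consider a subset of the columns, here column 'a', when identifying
        duplicates (a sketch of the expected output; skipped in doctests):

        >>> df.duplicated(subset=['a']).sort_index()  # doctest: +SKIP
        0    False
        1     True
        2     True
        3    False
        dtype: bool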
| """ |
| from pyspark.pandas.series import first_series |
| |
| sdf, column = self._mark_duplicates(subset, keep) |
| |
| sdf = sdf.select( |
| self._internal.index_spark_columns |
| + [scol_for(sdf, column).alias(SPARK_DEFAULT_SERIES_NAME)] |
| ) |
| return first_series( |
| DataFrame( |
| InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[ |
| scol_for(sdf, col) for col in self._internal.index_spark_column_names |
| ], |
| index_names=self._internal.index_names, |
| index_fields=self._internal.index_fields, |
| column_labels=[None], # type: ignore |
| data_spark_columns=[scol_for(sdf, SPARK_DEFAULT_SERIES_NAME)], |
| ) |
| ) |
| ) |
| |
| # TODO: support other as DataFrame or array-like |
| def dot(self, other: "Series") -> "Series": |
| """ |
| Compute the matrix multiplication between the DataFrame and other. |
| |
        This method computes the matrix product between the DataFrame and the
        values of another Series.
| |
| It can also be called using ``self @ other`` in Python >= 3.5. |
| |
        .. note:: This method is based on an expensive operation due to the nature
            of big data. Internally it needs to generate each row for each value, and
            then group twice - it is a huge operation. To prevent misuse, this method
            limits the input length via the 'compute.max_rows' option and raises a ValueError.
| |
| >>> from pyspark.pandas.config import option_context |
| >>> with option_context( |
| ... 'compute.max_rows', 1000, "compute.ops_on_diff_frames", True |
| ... ): # doctest: +NORMALIZE_WHITESPACE |
| ... psdf = ps.DataFrame({'a': range(1001)}) |
| ... psser = ps.Series([2], index=['a']) |
| ... psdf.dot(psser) |
| Traceback (most recent call last): |
| ... |
| ValueError: Current DataFrame has more then the given limit 1000 rows. |
| Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' |
| to retrieve to retrieve more than 1000 rows. Note that, before changing the |
| 'compute.max_rows', this operation is considerably expensive. |
| |
| Parameters |
| ---------- |
| other : Series |
| The other object to compute the matrix product with. |
| |
| Returns |
| ------- |
| Series |
| Return the matrix product between self and other as a Series. |
| |
| See Also |
| -------- |
| Series.dot: Similar method for Series. |
| |
| Notes |
| ----- |
| The dimensions of DataFrame and other must be compatible in order to |
| compute the matrix multiplication. In addition, the column names of |
| DataFrame and the index of other must contain the same values, as they |
| will be aligned prior to the multiplication. |
| |
| The dot method for Series computes the inner product, instead of the |
| matrix product here. |
| |
| Examples |
| -------- |
| >>> from pyspark.pandas.config import set_option, reset_option |
| >>> set_option("compute.ops_on_diff_frames", True) |
| >>> psdf = ps.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) |
| >>> psser = ps.Series([1, 1, 2, 1]) |
| >>> psdf.dot(psser) |
| 0 -4 |
| 1 5 |
| dtype: int64 |
| |
| Note how shuffling of the objects does not change the result. |
| |
| >>> psser2 = psser.reindex([1, 0, 2, 3]) |
| >>> psdf.dot(psser2) |
| 0 -4 |
| 1 5 |
| dtype: int64 |
| >>> psdf @ psser2 |
| 0 -4 |
| 1 5 |
| dtype: int64 |
| >>> reset_option("compute.ops_on_diff_frames") |
| """ |
| if not isinstance(other, ps.Series): |
| raise TypeError("Unsupported type {}".format(type(other).__name__)) |
| else: |
| return cast(ps.Series, other.dot(self.transpose())).rename(None) |
| |
| def __matmul__(self, other: "Series") -> "Series": |
| """ |
        Matrix multiplication using the binary `@` operator in Python >= 3.5.
| """ |
| return self.dot(other) |
| |
| def to_table( |
| self, |
| name: str, |
| format: Optional[str] = None, |
| mode: str = "w", |
| partition_cols: Optional[Union[str, List[str]]] = None, |
| index_col: Optional[Union[str, List[str]]] = None, |
| **options: Any |
| ) -> None: |
| mode = validate_mode(mode) |
| return self.spark.to_table(name, format, mode, partition_cols, index_col, **options) |
| |
| to_table.__doc__ = SparkFrameMethods.to_table.__doc__ |
| |
| def to_delta( |
| self, |
| path: str, |
| mode: str = "w", |
| partition_cols: Optional[Union[str, List[str]]] = None, |
| index_col: Optional[Union[str, List[str]]] = None, |
| **options: "OptionalPrimitiveType" |
| ) -> None: |
| """ |
| Write the DataFrame out as a Delta Lake table. |
| |
| Parameters |
| ---------- |
| path : str, required |
| Path to write to. |
| mode : str |
| Python write mode, default 'w'. |
| |
            .. note:: mode can accept the strings for Spark writing mode,
                such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.
| |
| - 'append' (equivalent to 'a'): Append the new data to existing data. |
| - 'overwrite' (equivalent to 'w'): Overwrite existing data. |
| - 'ignore': Silently ignore this operation if data already exists. |
| - 'error' or 'errorifexists': Throw an exception if data already exists. |
| |
| partition_cols : str or list of str, optional, default None |
| Names of partitioning columns |
        index_col : str or list of str, optional, default None
| Column names to be used in Spark to represent pandas-on-Spark's index. The index name |
| in pandas-on-Spark is ignored. By default, the index is always lost. |
| options : dict |
| All other options passed directly into Delta Lake. |
| |
| See Also |
| -------- |
| read_delta |
| DataFrame.to_parquet |
| DataFrame.to_table |
| DataFrame.to_spark_io |
| |
| Examples |
| -------- |
| |
| >>> df = ps.DataFrame(dict( |
| ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')), |
| ... country=['KR', 'US', 'JP'], |
| ... code=[1, 2 ,3]), columns=['date', 'country', 'code']) |
| >>> df |
| date country code |
| 0 2012-01-31 12:00:00 KR 1 |
| 1 2012-02-29 12:00:00 US 2 |
| 2 2012-03-31 12:00:00 JP 3 |
| |
| Create a new Delta Lake table, partitioned by one column: |
| |
| >>> df.to_delta('%s/to_delta/foo' % path, partition_cols='date') # doctest: +SKIP |
| |
| Partitioned by two columns: |
| |
| >>> df.to_delta('%s/to_delta/bar' % path, |
| ... partition_cols=['date', 'country']) # doctest: +SKIP |
| |
| Overwrite an existing table's partitions, using the 'replaceWhere' capability in Delta: |
| |
| >>> df.to_delta('%s/to_delta/bar' % path, |
| ... mode='overwrite', replaceWhere='date >= "2012-01-01"') # doctest: +SKIP |
| """ |
| if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1: |
| options = options.get("options") # type: ignore |
| |
| mode = validate_mode(mode) |
| self.spark.to_spark_io( |
| path=path, |
| mode=mode, |
| format="delta", |
| partition_cols=partition_cols, |
| index_col=index_col, |
| **options, |
| ) |
| |
| def to_parquet( |
| self, |
| path: str, |
| mode: str = "w", |
| partition_cols: Optional[Union[str, List[str]]] = None, |
| compression: Optional[str] = None, |
| index_col: Optional[Union[str, List[str]]] = None, |
| **options: Any |
| ) -> None: |
| """ |
| Write the DataFrame out as a Parquet file or directory. |
| |
| Parameters |
| ---------- |
| path : str, required |
| Path to write to. |
| mode : str |
| Python write mode, default 'w'. |
| |
            .. note:: mode can accept the strings for Spark writing mode,
                such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.
| |
| - 'append' (equivalent to 'a'): Append the new data to existing data. |
| - 'overwrite' (equivalent to 'w'): Overwrite existing data. |
| - 'ignore': Silently ignore this operation if data already exists. |
| - 'error' or 'errorifexists': Throw an exception if data already exists. |
| |
| partition_cols : str or list of str, optional, default None |
| Names of partitioning columns |
| compression : str {'none', 'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'lz4', 'zstd'} |
| Compression codec to use when saving to file. If None is set, it uses the |
| value specified in `spark.sql.parquet.compression.codec`. |
        index_col : str or list of str, optional, default None
| Column names to be used in Spark to represent pandas-on-Spark's index. The index name |
| in pandas-on-Spark is ignored. By default, the index is always lost. |
| options : dict |
| All other options passed directly into Spark's data source. |
| |
| See Also |
| -------- |
| read_parquet |
| DataFrame.to_delta |
| DataFrame.to_table |
| DataFrame.to_spark_io |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame(dict( |
| ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')), |
| ... country=['KR', 'US', 'JP'], |
| ... code=[1, 2 ,3]), columns=['date', 'country', 'code']) |
| >>> df |
| date country code |
| 0 2012-01-31 12:00:00 KR 1 |
| 1 2012-02-29 12:00:00 US 2 |
| 2 2012-03-31 12:00:00 JP 3 |
| |
| >>> df.to_parquet('%s/to_parquet/foo.parquet' % path, partition_cols='date') |
| |
| >>> df.to_parquet( |
| ... '%s/to_parquet/foo.parquet' % path, |
| ... mode = 'overwrite', |
| ... partition_cols=['date', 'country']) |
| """ |
| if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1: |
| options = options.get("options") # type: ignore |
| |
| mode = validate_mode(mode) |
| builder = self.to_spark(index_col=index_col).write.mode(mode) |
| if partition_cols is not None: |
| builder.partitionBy(partition_cols) |
| if compression is not None: |
| builder.option("compression", compression) |
| builder.options(**options).format("parquet").save(path) |
| |
| def to_orc( |
| self, |
| path: str, |
| mode: str = "w", |
| partition_cols: Optional[Union[str, List[str]]] = None, |
| index_col: Optional[Union[str, List[str]]] = None, |
| **options: "OptionalPrimitiveType" |
| ) -> None: |
| """ |
        Write the DataFrame out as an ORC file or directory.
| |
| Parameters |
| ---------- |
| path : str, required |
| Path to write to. |
| mode : str |
| Python write mode, default 'w'. |
| |
            .. note:: mode can accept the strings for Spark writing mode,
                such as 'append', 'overwrite', 'ignore', 'error', 'errorifexists'.
| |
| - 'append' (equivalent to 'a'): Append the new data to existing data. |
| - 'overwrite' (equivalent to 'w'): Overwrite existing data. |
| - 'ignore': Silently ignore this operation if data already exists. |
| - 'error' or 'errorifexists': Throw an exception if data already exists. |
| |
| partition_cols : str or list of str, optional, default None |
| Names of partitioning columns |
        index_col : str or list of str, optional, default None
| Column names to be used in Spark to represent pandas-on-Spark's index. The index name |
| in pandas-on-Spark is ignored. By default, the index is always lost. |
| options : dict |
| All other options passed directly into Spark's data source. |
| |
| See Also |
| -------- |
| read_orc |
| DataFrame.to_delta |
| DataFrame.to_parquet |
| DataFrame.to_table |
| DataFrame.to_spark_io |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame(dict( |
| ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')), |
| ... country=['KR', 'US', 'JP'], |
| ... code=[1, 2 ,3]), columns=['date', 'country', 'code']) |
| >>> df |
| date country code |
| 0 2012-01-31 12:00:00 KR 1 |
| 1 2012-02-29 12:00:00 US 2 |
| 2 2012-03-31 12:00:00 JP 3 |
| |
| >>> df.to_orc('%s/to_orc/foo.orc' % path, partition_cols='date') |
| |
| >>> df.to_orc( |
| ... '%s/to_orc/foo.orc' % path, |
| ... mode = 'overwrite', |
| ... partition_cols=['date', 'country']) |
| """ |
| if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1: |
| options = options.get("options") # type: ignore |
| |
| mode = validate_mode(mode) |
| self.spark.to_spark_io( |
| path=path, |
| mode=mode, |
| format="orc", |
| partition_cols=partition_cols, |
| index_col=index_col, |
| **options, |
| ) |
| |
| def to_spark_io( |
| self, |
| path: Optional[str] = None, |
| format: Optional[str] = None, |
| mode: str = "overwrite", |
| partition_cols: Optional[Union[str, List[str]]] = None, |
| index_col: Optional[Union[str, List[str]]] = None, |
| **options: "OptionalPrimitiveType" |
| ) -> None: |
| """An alias for :func:`DataFrame.spark.to_spark_io`. |
| See :meth:`pyspark.pandas.spark.accessors.SparkFrameMethods.to_spark_io`. |
| |
| .. deprecated:: 3.2.0 |
| Use :func:`DataFrame.spark.to_spark_io` instead. |
| """ |
| warnings.warn("Deprecated in 3.2, Use spark.to_spark_io instead.", FutureWarning) |
| return self.spark.to_spark_io(path, format, mode, partition_cols, index_col, **options) |
| |
| to_spark_io.__doc__ = SparkFrameMethods.to_spark_io.__doc__ |
| |
| def to_spark(self, index_col: Optional[Union[str, List[str]]] = None) -> SparkDataFrame: |
| return self.spark.frame(index_col) |
| |
    to_spark.__doc__ = SparkFrameMethods.frame.__doc__
| |
| def to_pandas(self) -> pd.DataFrame: |
| """ |
| Return a pandas DataFrame. |
| |
| .. note:: This method should only be used if the resulting pandas DataFrame is expected |
| to be small, as all the data is loaded into the driver's memory. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], |
| ... columns=['dogs', 'cats']) |
| >>> df.to_pandas() |
| dogs cats |
| 0 0.2 0.3 |
| 1 0.0 0.6 |
| 2 0.6 0.0 |
| 3 0.2 0.1 |
| """ |
| return self._internal.to_pandas_frame.copy() |
| |
| def assign(self, **kwargs: Any) -> "DataFrame": |
| """ |
| Assign new columns to a DataFrame. |
| |
| Returns a new object with all original columns in addition to new ones. |
| Existing columns that are re-assigned will be overwritten. |
| |
| Parameters |
| ---------- |
| **kwargs : dict of {str: callable, Series or Index} |
| The column names are keywords. If the values are |
| callable, they are computed on the DataFrame and |
| assigned to the new columns. The callable must not |
| change input DataFrame (though pandas-on-Spark doesn't check it). |
| If the values are not callable, (e.g. a Series or a literal), |
| they are simply assigned. |
| |
| Returns |
| ------- |
| DataFrame |
| A new DataFrame with the new columns in addition to |
| all the existing columns. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'temp_c': [17.0, 25.0]}, |
| ... index=['Portland', 'Berkeley']) |
| >>> df |
| temp_c |
| Portland 17.0 |
| Berkeley 25.0 |
| |
| Where the value is a callable, evaluated on `df`: |
| |
| >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) |
| temp_c temp_f |
| Portland 17.0 62.6 |
| Berkeley 25.0 77.0 |
| |
| Alternatively, the same behavior can be achieved by directly |
| referencing an existing Series or sequence and you can also |
| create multiple columns within the same assign. |
| |
| >>> assigned = df.assign(temp_f=df['temp_c'] * 9 / 5 + 32, |
| ... temp_k=df['temp_c'] + 273.15, |
| ... temp_idx=df.index) |
| >>> assigned[['temp_c', 'temp_f', 'temp_k', 'temp_idx']] |
| temp_c temp_f temp_k temp_idx |
| Portland 17.0 62.6 290.15 Portland |
| Berkeley 25.0 77.0 298.15 Berkeley |
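
        Re-assigning an existing column overwrites it (a sketch of the expected
        output; skipped in doctests):

        >>> df.assign(temp_c=df['temp_c'] - 10)  # doctest: +SKIP
                  temp_c
        Portland     7.0
        Berkeley    15.0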
| |
| Notes |
| ----- |
| Assigning multiple columns within the same ``assign`` is possible |
| but you cannot refer to newly created or modified columns. This |
| feature is supported in pandas for Python 3.6 and later but not in |
| pandas-on-Spark. In pandas-on-Spark, all items are computed first, |
| and then assigned. |
| """ |
| return self._assign(kwargs) |
| |
| def _assign(self, kwargs: Any) -> "DataFrame": |
| assert isinstance(kwargs, dict) |
| from pyspark.pandas.indexes import MultiIndex |
| from pyspark.pandas.series import IndexOpsMixin |
| |
| for k, v in kwargs.items(): |
| is_invalid_assignee = ( |
| not (isinstance(v, (IndexOpsMixin, Column)) or callable(v) or is_scalar(v)) |
| ) or isinstance(v, MultiIndex) |
| if is_invalid_assignee: |
                raise TypeError(
                    "Column assignment doesn't support type {0}".format(type(v).__name__)
                )
| if callable(v): |
| kwargs[k] = v(self) |
| |
| pairs = { |
| (k if is_name_like_tuple(k) else (k,)): ( |
| (v.spark.column, v._internal.data_fields[0]) |
| if isinstance(v, IndexOpsMixin) and not isinstance(v, MultiIndex) |
| else (v, None) |
| if isinstance(v, Column) |
| else (SF.lit(v), None) |
| ) |
| for k, v in kwargs.items() |
| } |
| |
| scols = [] |
| data_fields = [] |
| for label in self._internal.column_labels: |
| for i in range(len(label)): |
| if label[: len(label) - i] in pairs: |
| scol, field = pairs[label[: len(label) - i]] |
| |
| name = self._internal.spark_column_name_for(label) |
| scol = scol.alias(name) |
| if field is not None: |
| field = field.copy(name=name) |
| break |
| else: |
| scol = self._internal.spark_column_for(label) |
| field = self._internal.field_for(label) |
| scols.append(scol) |
| data_fields.append(field) |
| |
| column_labels = self._internal.column_labels.copy() |
| for label, (scol, field) in pairs.items(): |
| if label not in set(i[: len(label)] for i in self._internal.column_labels): |
| name = name_like_string(label) |
| scols.append(scol.alias(name)) |
| if field is not None: |
| field = field.copy(name=name) |
| data_fields.append(field) |
| |
| column_labels.append(label) |
| |
| level = self._internal.column_labels_level |
| column_labels = [ |
| tuple(list(label) + ([""] * (level - len(label)))) for label in column_labels |
| ] |
| |
| internal = self._internal.with_new_columns( |
| scols, column_labels=column_labels, data_fields=data_fields |
| ) |
| return DataFrame(internal) |
| |
| @staticmethod |
| def from_records( |
| data: Union[np.ndarray, List[tuple], dict, pd.DataFrame], |
        index: Optional[Union[str, list, np.ndarray]] = None,
        exclude: Optional[list] = None,
        columns: Optional[list] = None,
        coerce_float: bool = False,
        nrows: Optional[int] = None,
| ) -> "DataFrame": |
| """ |
| Convert structured or record ndarray to DataFrame. |
| |
| Parameters |
| ---------- |
| data : ndarray (structured dtype), list of tuples, dict, or DataFrame |
| index : string, list of fields, array-like |
| Field of array to use as the index, alternately a specific set of input labels to use |
| exclude : sequence, default None |
| Columns or fields to exclude |
| columns : sequence, default None |
| Column names to use. If the passed data do not have names associated with them, this |
| argument provides names for the columns. Otherwise this argument indicates the order of |
| the columns in the result (any names not found in the data will become all-NA columns) |
| coerce_float : boolean, default False |
| Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to |
| floating point, useful for SQL result sets |
| nrows : int, default None |
| Number of rows to read if data is an iterator |
| |
| Returns |
| ------- |
| df : DataFrame |
| |
| Examples |
| -------- |
| Use dict as input |
| |
| >>> ps.DataFrame.from_records({'A': [1, 2, 3]}) |
| A |
| 0 1 |
| 1 2 |
| 2 3 |
| |
| Use list of tuples as input |
| |
| >>> ps.DataFrame.from_records([(1, 2), (3, 4)]) |
| 0 1 |
| 0 1 2 |
| 1 3 4 |
| |
| Use NumPy array as input |
| |
| >>> ps.DataFrame.from_records(np.eye(3)) |
| 0 1 2 |
| 0 1.0 0.0 0.0 |
| 1 0.0 1.0 0.0 |
| 2 0.0 0.0 1.0 |
| """ |
| return DataFrame( |
| pd.DataFrame.from_records(data, index, exclude, columns, coerce_float, nrows) |
| ) |
| |
| def to_records( |
| self, |
| index: bool = True, |
| column_dtypes: Optional[Union[str, Dtype, Dict[Name, Union[str, Dtype]]]] = None, |
| index_dtypes: Optional[Union[str, Dtype, Dict[Name, Union[str, Dtype]]]] = None, |
| ) -> np.recarray: |
| """ |
| Convert DataFrame to a NumPy record array. |
| |
| Index will be included as the first field of the record array if |
| requested. |
| |
| .. note:: This method should only be used if the resulting NumPy ndarray is |
| expected to be small, as all the data is loaded into the driver's memory. |
| |
| Parameters |
| ---------- |
| index : bool, default True |
| Include index in resulting record array, stored in 'index' |
| field or using the index label, if set. |
| column_dtypes : str, type, dict, default None |
| If a string or type, the data type to store all columns. If |
| a dictionary, a mapping of column names and indices (zero-indexed) |
| to specific data types. |
| index_dtypes : str, type, dict, default None |
| If a string or type, the data type to store all index levels. If |
| a dictionary, a mapping of index level names and indices |
| (zero-indexed) to specific data types. |
| This mapping is applied only if `index=True`. |
| |
| Returns |
| ------- |
| numpy.recarray |
| NumPy ndarray with the DataFrame labels as fields and each row |
| of the DataFrame as entries. |
| |
| See Also |
| -------- |
| DataFrame.from_records: Convert structured or record ndarray |
| to DataFrame. |
| numpy.recarray: An ndarray that allows field access using |
| attributes, analogous to typed columns in a |
| spreadsheet. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, |
| ... index=['a', 'b']) |
| >>> df |
| A B |
| a 1 0.50 |
| b 2 0.75 |
| |
| >>> df.to_records() # doctest: +SKIP |
| rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], |
| dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')]) |
| |
| The index can be excluded from the record array: |
| |
| >>> df.to_records(index=False) # doctest: +SKIP |
| rec.array([(1, 0.5 ), (2, 0.75)], |
| dtype=[('A', '<i8'), ('B', '<f8')]) |
| |
| Specification of dtype for columns is new in pandas 0.24.0. |
| Data types can be specified for the columns: |
| |
| >>> df.to_records(column_dtypes={"A": "int32"}) # doctest: +SKIP |
| rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], |
| dtype=[('index', 'O'), ('A', '<i4'), ('B', '<f8')]) |
| |
| Specification of dtype for index is new in pandas 0.24.0. |
| Data types can also be specified for the index: |
| |
| >>> df.to_records(index_dtypes="<S2") # doctest: +SKIP |
| rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], |
| dtype=[('index', 'S2'), ('A', '<i8'), ('B', '<f8')]) |
| """ |
| args = locals() |
| psdf = self |
| |
| return validate_arguments_and_invoke_function( |
| psdf._to_internal_pandas(), self.to_records, pd.DataFrame.to_records, args |
| ) |
| |
| def copy(self, deep: bool = True) -> "DataFrame": |
| """ |
| Make a copy of this object's indices and data. |
| |
| Parameters |
| ---------- |
| deep : bool, default True |
            This parameter is not supported; it is just a dummy parameter to match pandas.
| |
| Returns |
| ------- |
| copy : DataFrame |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]}, |
| ... columns=['x', 'y', 'z', 'w']) |
| >>> df |
| x y z w |
| 0 1 3 5 7 |
| 1 2 4 6 8 |
| >>> df_copy = df.copy() |
| >>> df_copy |
| x y z w |
| 0 1 3 5 7 |
| 1 2 4 6 8 |
| """ |
| return DataFrame(self._internal) |
| |
| def dropna( |
| self, |
| axis: Axis = 0, |
| how: str = "any", |
| thresh: Optional[int] = None, |
| subset: Optional[Union[Name, List[Name]]] = None, |
| inplace: bool = False, |
| ) -> Optional["DataFrame"]: |
| """ |
| Remove missing values. |
| |
| Parameters |
| ---------- |
| axis : {0 or 'index'}, default 0 |
| Determine if rows or columns which contain missing values are |
| removed. |
| |
| * 0, or 'index' : Drop rows which contain missing values. |
| how : {'any', 'all'}, default 'any' |
| Determine if row or column is removed from DataFrame, when we have |
| at least one NA or all NA. |
| |
| * 'any' : If any NA values are present, drop that row or column. |
| * 'all' : If all values are NA, drop that row or column. |
| |
| thresh : int, optional |
| Require that many non-NA values. |
| subset : array-like, optional |
| Labels along other axis to consider, e.g. if you are dropping rows |
| these would be a list of columns to include. |
| inplace : bool, default False |
| If True, do operation inplace and return None. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame with NA entries dropped from it. |
| |
| See Also |
| -------- |
| DataFrame.drop : Drop specified labels from columns. |
| DataFrame.isnull: Indicate missing values. |
| DataFrame.notnull : Indicate existing (non-missing) values. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], |
| ... "toy": [None, 'Batmobile', 'Bullwhip'], |
| ... "born": [None, "1940-04-25", None]}, |
| ... columns=['name', 'toy', 'born']) |
| >>> df |
| name toy born |
| 0 Alfred None None |
| 1 Batman Batmobile 1940-04-25 |
| 2 Catwoman Bullwhip None |
| |
| Drop the rows where at least one element is missing. |
| |
| >>> df.dropna() |
| name toy born |
| 1 Batman Batmobile 1940-04-25 |
| |
| Drop the columns where at least one element is missing. |
| |
| >>> df.dropna(axis='columns') |
| name |
| 0 Alfred |
| 1 Batman |
| 2 Catwoman |
| |
| Drop the rows where all elements are missing. |
| |
| >>> df.dropna(how='all') |
| name toy born |
| 0 Alfred None None |
| 1 Batman Batmobile 1940-04-25 |
| 2 Catwoman Bullwhip None |
| |
| Keep only the rows with at least 2 non-NA values. |
| |
| >>> df.dropna(thresh=2) |
| name toy born |
| 1 Batman Batmobile 1940-04-25 |
| 2 Catwoman Bullwhip None |
| |
| Define in which columns to look for missing values. |
| |
| >>> df.dropna(subset=['name', 'born']) |
| name toy born |
| 1 Batman Batmobile 1940-04-25 |
| |
| Keep the DataFrame with valid entries in the same variable. |
| |
| >>> df.dropna(inplace=True) |
| >>> df |
| name toy born |
| 1 Batman Batmobile 1940-04-25 |
| """ |
| axis = validate_axis(axis) |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| |
| if thresh is None: |
| if how is None: |
| raise TypeError("must specify how or thresh") |
| elif how not in ("any", "all"): |
| raise ValueError("invalid how option: {h}".format(h=how)) |
| |
| if subset is not None: |
| if isinstance(subset, str): |
| labels = [(subset,)] # type: Optional[List[Label]] |
| elif isinstance(subset, tuple): |
| labels = [subset] |
| else: |
| labels = [sub if isinstance(sub, tuple) else (sub,) for sub in subset] |
| else: |
| labels = None |
| |
| if axis == 0: |
| if labels is not None: |
| invalids = [label for label in labels if label not in self._internal.column_labels] |
| if len(invalids) > 0: |
| raise KeyError(invalids) |
| else: |
| labels = self._internal.column_labels |
| |
| cnt = reduce( |
| lambda x, y: x + y, |
| [ |
| F.when(self._psser_for(label).notna().spark.column, 1).otherwise(0) |
| for label in labels |
| ], |
| SF.lit(0), |
| ) |
| if thresh is not None: |
| pred = cnt >= SF.lit(int(thresh)) |
| elif how == "any": |
| pred = cnt == SF.lit(len(labels)) |
| elif how == "all": |
| pred = cnt > SF.lit(0) |
| |
| internal = self._internal.with_filter(pred) |
| if inplace: |
| self._update_internal_frame(internal) |
| return None |
| else: |
| return DataFrame(internal) |
| else: |
| assert axis == 1 |
| |
| internal = self._internal.resolved_copy |
| |
| if labels is not None: |
| if any(len(lbl) != internal.index_level for lbl in labels): |
| raise ValueError( |
| "The length of each subset must be the same as the index size." |
| ) |
| |
| cond = reduce( |
| lambda x, y: x | y, |
| [ |
| reduce( |
| lambda x, y: x & y, |
| [ |
| scol == SF.lit(l) |
| for l, scol in zip(lbl, internal.index_spark_columns) |
| ], |
| ) |
| for lbl in labels |
| ], |
| ) |
| |
| internal = internal.with_filter(cond) |
| |
| psdf = DataFrame(internal) |
| |
| null_counts = [] |
| for label in internal.column_labels: |
| psser = psdf._psser_for(label) |
| cond = psser.isnull().spark.column |
| null_counts.append( |
| F.sum(F.when(~cond, 1).otherwise(0)).alias(name_like_string(label)) |
| ) |
| |
| counts = internal.spark_frame.select(null_counts + [F.count("*")]).head() |
| |
| if thresh is not None: |
| column_labels = [ |
| label |
| for label, cnt in zip(internal.column_labels, counts) |
| if (cnt or 0) >= int(thresh) |
| ] |
| elif how == "any": |
| column_labels = [ |
| label |
| for label, cnt in zip(internal.column_labels, counts) |
| if (cnt or 0) == counts[-1] |
| ] |
| elif how == "all": |
| column_labels = [ |
| label for label, cnt in zip(internal.column_labels, counts) if (cnt or 0) > 0 |
| ] |
| |
| psdf = self[column_labels] |
| if inplace: |
| self._update_internal_frame(psdf._internal) |
| return None |
| else: |
| return psdf |
| |
| # TODO: add 'limit' when value parameter exists |
| def fillna( |
| self, |
| value: Optional[Union[Any, Dict[Name, Any]]] = None, |
| method: Optional[str] = None, |
| axis: Optional[Axis] = None, |
| inplace: bool = False, |
| limit: Optional[int] = None, |
| ) -> Optional["DataFrame"]: |
| """Fill NA/NaN values. |
| |
        .. note:: the current implementation of the 'method' parameter in fillna uses Spark's
            Window without specifying a partition specification. This leads to moving all data
            into a single partition in a single machine and could cause serious
            performance degradation. Avoid this method with very large datasets.
| |
| Parameters |
| ---------- |
| value : scalar, dict, Series |
| Value to use to fill holes. alternately a dict/Series of values |
| specifying which value to use for each column. |
| DataFrame is not supported. |
| method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None |
            Method to use for filling holes in reindexed Series.
            pad / ffill: propagate last valid observation forward to next valid.
            backfill / bfill: use next valid observation to fill gap.
| axis : {0 or `index`} |
| 1 and `columns` are not supported. |
| inplace : boolean, default False |
| Fill in place (do not create a new object) |
| limit : int, default None |
| If method is specified, this is the maximum number of consecutive NaN values to |
| forward/backward fill. In other words, if there is a gap with more than this number of |
| consecutive NaNs, it will only be partially filled. If method is not specified, |
| this is the maximum number of entries along the entire axis where NaNs will be filled. |
            Must be greater than 0 if not None.
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame with NA entries filled. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({ |
| ... 'A': [None, 3, None, None], |
| ... 'B': [2, 4, None, 3], |
| ... 'C': [None, None, None, 1], |
| ... 'D': [0, 1, 5, 4] |
| ... }, |
| ... columns=['A', 'B', 'C', 'D']) |
| >>> df |
| A B C D |
| 0 NaN 2.0 NaN 0 |
| 1 3.0 4.0 NaN 1 |
| 2 NaN NaN NaN 5 |
| 3 NaN 3.0 1.0 4 |
| |
| Replace all NaN elements with 0s. |
| |
| >>> df.fillna(0) |
| A B C D |
| 0 0.0 2.0 0.0 0 |
| 1 3.0 4.0 0.0 1 |
| 2 0.0 0.0 0.0 5 |
| 3 0.0 3.0 1.0 4 |
| |
| We can also propagate non-null values forward or backward. |
| |
| >>> df.fillna(method='ffill') |
| A B C D |
| 0 NaN 2.0 NaN 0 |
| 1 3.0 4.0 NaN 1 |
| 2 3.0 4.0 NaN 5 |
| 3 3.0 3.0 1.0 4 |
| |
| Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1, |
| 2, and 3 respectively. |
| |
| >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3} |
| >>> df.fillna(value=values) |
| A B C D |
| 0 0.0 2.0 2.0 0 |
| 1 3.0 4.0 2.0 1 |
| 2 0.0 1.0 2.0 5 |
| 3 0.0 3.0 1.0 4 |
| """ |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError("fillna currently only works for axis=0 or axis='index'") |
| |
| if value is not None: |
| if not isinstance(value, (float, int, str, bool, dict, pd.Series)): |
| raise TypeError("Unsupported type %s" % type(value).__name__) |
| if limit is not None: |
| raise ValueError("limit parameter for value is not support now") |
| if isinstance(value, pd.Series): |
| value = value.to_dict() |
| if isinstance(value, dict): |
| for v in value.values(): |
| if not isinstance(v, (float, int, str, bool)): |
| raise TypeError("Unsupported type %s" % type(v).__name__) |
| value = {k if is_name_like_tuple(k) else (k,): v for k, v in value.items()} |
| |
| def op(psser: ps.Series) -> ps.Series: |
| label = psser._column_label |
| for k, v in value.items(): |
| if k == label[: len(k)]: |
| return psser._fillna( |
| value=value[k], method=method, axis=axis, limit=limit |
| ) |
| else: |
| return psser |
| |
| else: |
| op = lambda psser: psser._fillna(value=value, method=method, axis=axis, limit=limit) |
| elif method is not None: |
| op = lambda psser: psser._fillna(value=value, method=method, axis=axis, limit=limit) |
| else: |
| raise ValueError("Must specify a fillna 'value' or 'method' parameter.") |
| |
| psdf = self._apply_series_op(op, should_resolve=(method is not None)) |
| |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| if inplace: |
| self._update_internal_frame(psdf._internal, requires_same_anchor=False) |
| return None |
| else: |
| return psdf |
| |
| def replace( |
| self, |
| to_replace: Optional[Union[Any, List, Tuple, Dict]] = None, |
| value: Optional[Any] = None, |
| inplace: bool = False, |
| limit: Optional[int] = None, |
| regex: bool = False, |
| method: str = "pad", |
| ) -> Optional["DataFrame"]: |
| """ |
| Returns a new DataFrame replacing a value with another value. |
| |
| Parameters |
| ---------- |
| to_replace : int, float, string, list, tuple or dict |
| Value to be replaced. |
| value : int, float, string, list or tuple |
| Value to use to replace holes. The replacement value must be an int, float, |
| or string. |
| If value is a list or tuple, value should be of the same length with to_replace. |
| inplace : boolean, default False |
| Fill in place (do not create a new object) |
| |
| Returns |
| ------- |
| DataFrame |
| Object after replacement. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({"name": ['Ironman', 'Captain America', 'Thor', 'Hulk'], |
| ... "weapon": ['Mark-45', 'Shield', 'Mjolnir', 'Smash']}, |
| ... columns=['name', 'weapon']) |
| >>> df |
| name weapon |
| 0 Ironman Mark-45 |
| 1 Captain America Shield |
| 2 Thor Mjolnir |
| 3 Hulk Smash |
| |
| Scalar `to_replace` and `value` |
| |
| >>> df.replace('Ironman', 'War-Machine') |
| name weapon |
| 0 War-Machine Mark-45 |
| 1 Captain America Shield |
| 2 Thor Mjolnir |
| 3 Hulk Smash |
| |
| List like `to_replace` and `value` |
| |
| >>> df.replace(['Ironman', 'Captain America'], ['Rescue', 'Hawkeye'], inplace=True) |
| >>> df |
| name weapon |
| 0 Rescue Mark-45 |
| 1 Hawkeye Shield |
| 2 Thor Mjolnir |
| 3 Hulk Smash |
| |
        Dicts can be used to specify different replacement values for different existing values.
        To use a dict in this way, the value parameter should be None.
| |
| >>> df.replace({'Mjolnir': 'Stormbuster'}) |
| name weapon |
| 0 Rescue Mark-45 |
| 1 Hawkeye Shield |
| 2 Thor Stormbuster |
| 3 Hulk Smash |
| |
        A dict can specify that different values should be replaced in different columns.
        The value parameter should not be None in this case.
| |
| >>> df.replace({'weapon': 'Mjolnir'}, 'Stormbuster') |
| name weapon |
| 0 Rescue Mark-45 |
| 1 Hawkeye Shield |
| 2 Thor Stormbuster |
| 3 Hulk Smash |
| |
        Nested dictionaries are also supported.
        The value parameter should be None to use a nested dict in this way.
| |
| >>> df.replace({'weapon': {'Mjolnir': 'Stormbuster'}}) |
| name weapon |
| 0 Rescue Mark-45 |
| 1 Hawkeye Shield |
| 2 Thor Stormbuster |
| 3 Hulk Smash |
| """ |
| if method != "pad": |
| raise NotImplementedError("replace currently works only for method='pad") |
| if limit is not None: |
| raise NotImplementedError("replace currently works only when limit=None") |
| if regex is not False: |
| raise NotImplementedError("replace currently doesn't supports regex") |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| |
| if value is not None and not isinstance(value, (int, float, str, list, tuple, dict)): |
| raise TypeError("Unsupported type {}".format(type(value).__name__)) |
| if to_replace is not None and not isinstance( |
| to_replace, (int, float, str, list, tuple, dict) |
| ): |
| raise TypeError("Unsupported type {}".format(type(to_replace).__name__)) |
| |
| if isinstance(value, (list, tuple)) and isinstance(to_replace, (list, tuple)): |
| if len(value) != len(to_replace): |
| raise ValueError("Length of to_replace and value must be same") |
| |
| if isinstance(to_replace, dict) and ( |
| value is not None or all(isinstance(i, dict) for i in to_replace.values()) |
| ): |
| to_replace_dict = cast(dict, to_replace) |
| |
| def op(psser: ps.Series) -> ps.Series: |
| if psser.name in to_replace_dict: |
| return psser.replace( |
| to_replace=to_replace_dict[psser.name], value=value, regex=regex |
| ) |
| else: |
| return psser |
| |
| else: |
| op = lambda psser: psser.replace(to_replace=to_replace, value=value, regex=regex) |
| |
| psdf = self._apply_series_op(op) |
| if inplace: |
| self._update_internal_frame(psdf._internal) |
| return None |
| else: |
| return psdf |
| |
    def clip(
        self,
        lower: Optional[Union[float, int]] = None,
        upper: Optional[Union[float, int]] = None,
    ) -> "DataFrame":
| """ |
| Trim values at input threshold(s). |
| |
| Assigns values outside boundary to boundary values. |
| |
| Parameters |
| ---------- |
| lower : float or int, default None |
| Minimum threshold value. All values below this threshold will be set to it. |
| upper : float or int, default None |
| Maximum threshold value. All values above this threshold will be set to it. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame with the values outside the clip boundaries replaced. |
| |
| Examples |
| -------- |
| >>> ps.DataFrame({'A': [0, 2, 4]}).clip(1, 3) |
| A |
| 0 1 |
| 1 2 |
| 2 3 |
| |
| Notes |
| ----- |
| One difference between this implementation and pandas is that running |
| pd.DataFrame({'A': ['a', 'b']}).clip(0, 1) will crash with "TypeError: '<=' not supported |
| between instances of 'str' and 'int'" while ps.DataFrame({'A': ['a', 'b']}).clip(0, 1) |
| will output the original DataFrame, simply ignoring the incompatible types. |
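For example, the following sketch of that permissive behavior (skipped in
doctests) leaves the string column untouched:

>>> ps.DataFrame({'A': ['a', 'b']}).clip(0, 1)  # doctest: +SKIP
A
0 a
1 b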
| """ |
| if is_list_like(lower) or is_list_like(upper): |
| raise TypeError( |
| "List-like value are not supported for 'lower' and 'upper' at the " + "moment" |
| ) |
| |
| if lower is None and upper is None: |
| return self |
| |
| return self._apply_series_op(lambda psser: psser.clip(lower=lower, upper=upper)) |
| |
| def head(self, n: int = 5) -> "DataFrame": |
| """ |
| Return the first `n` rows. |
| |
| This function returns the first `n` rows for the object based |
| on position. It is useful for quickly testing if your object |
| has the right type of data in it. |
| |
| Parameters |
| ---------- |
| n : int, default 5 |
| Number of rows to select. |
| |
| Returns |
| ------- |
| obj_head : same type as caller |
| The first `n` rows of the caller object. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion', |
| ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) |
| >>> df |
| animal |
| 0 alligator |
| 1 bee |
| 2 falcon |
| 3 lion |
| 4 monkey |
| 5 parrot |
| 6 shark |
| 7 whale |
| 8 zebra |
| |
| Viewing the first 5 lines |
| |
| >>> df.head() |
| animal |
| 0 alligator |
| 1 bee |
| 2 falcon |
| 3 lion |
| 4 monkey |
| |
| Viewing the first `n` lines (three in this case) |
| |
| >>> df.head(3) |
| animal |
| 0 alligator |
| 1 bee |
| 2 falcon |
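
As in pandas, a negative `n` keeps all rows but the last ``|n|``
(a sketch; skipped in doctests):

>>> df.head(-7)  # doctest: +SKIP
animal
0 alligator
1 bee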
| """ |
| if n < 0: |
| n = len(self) + n |
| if n <= 0: |
| return DataFrame(self._internal.with_filter(SF.lit(False))) |
| else: |
| sdf = self._internal.resolved_copy.spark_frame |
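# Optionally sort by the natural row order first so that head() returns
# a deterministic result (controlled by the 'compute.ordered_head' option).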
| if get_option("compute.ordered_head"): |
| sdf = sdf.orderBy(NATURAL_ORDER_COLUMN_NAME) |
| return DataFrame(self._internal.with_new_sdf(sdf.limit(n))) |
| |
| def last(self, offset: Union[str, DateOffset]) -> "DataFrame": |
| """ |
| Select final periods of time series data based on a date offset. |
| |
| When having a DataFrame with dates as index, this function can |
| select the last few rows based on a date offset. |
| |
| Parameters |
| ---------- |
| offset : str or DateOffset |
| The offset length of the data that will be selected. For instance, |
| '3D' will display all the rows having their index within the last 3 days. |
| |
| Returns |
| ------- |
| DataFrame |
| A subset of the caller. |
| |
| Raises |
| ------ |
| TypeError |
| If the index is not a :class:`DatetimeIndex` |
| |
| Examples |
| -------- |
| |
| >>> index = pd.date_range('2018-04-09', periods=4, freq='2D') |
| >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=index) |
| >>> psdf |
| A |
| 2018-04-09 1 |
| 2018-04-11 2 |
| 2018-04-13 3 |
| 2018-04-15 4 |
| |
| Get the rows for the last 3 days: |
| |
| >>> psdf.last('3D') |
| A |
| 2018-04-13 3 |
| 2018-04-15 4 |
| |
Notice that the data for the last 3 calendar days were returned, not the
last 3 observed days in the dataset, and therefore data for 2018-04-11 was
not returned.
| """ |
# The index must be a DatetimeIndex.
| if not isinstance(self.index, ps.DatetimeIndex): |
| raise TypeError("'last' only supports a DatetimeIndex") |
| |
| offset = to_offset(offset) |
| from_date = self.index.max() - offset |
| |
| return cast(DataFrame, self.loc[from_date:]) |
| |
| def first(self, offset: Union[str, DateOffset]) -> "DataFrame": |
| """ |
| Select first periods of time series data based on a date offset. |
| |
| When having a DataFrame with dates as index, this function can |
| select the first few rows based on a date offset. |
| |
| Parameters |
| ---------- |
| offset : str or DateOffset |
| The offset length of the data that will be selected. For instance, |
| '3D' will display all the rows having their index within the first 3 days. |
| |
| Returns |
| ------- |
| DataFrame |
| A subset of the caller. |
| |
| Raises |
| ------ |
| TypeError |
| If the index is not a :class:`DatetimeIndex` |
| |
| Examples |
| -------- |
| |
| >>> index = pd.date_range('2018-04-09', periods=4, freq='2D') |
| >>> psdf = ps.DataFrame({'A': [1, 2, 3, 4]}, index=index) |
| >>> psdf |
| A |
| 2018-04-09 1 |
| 2018-04-11 2 |
| 2018-04-13 3 |
| 2018-04-15 4 |
| |
Get the rows for the first 3 days:
| |
| >>> psdf.first('3D') |
| A |
| 2018-04-09 1 |
| 2018-04-11 2 |
| |
Notice that the data for the first 3 calendar days were returned, not the
first 3 observed days in the dataset, and therefore data for 2018-04-13 was
not returned.
| """ |
# The index must be a DatetimeIndex.
| if not isinstance(self.index, ps.DatetimeIndex): |
| raise TypeError("'first' only supports a DatetimeIndex") |
| |
| offset = to_offset(offset) |
| to_date = self.index.min() + offset |
| |
| return cast(DataFrame, self.loc[:to_date]) |
| |
| def pivot_table( |
| self, |
| values: Optional[Union[Name, List[Name]]] = None, |
| index: Optional[List[Name]] = None, |
| columns: Optional[Name] = None, |
| aggfunc: Union[str, Dict[Name, str]] = "mean", |
| fill_value: Optional[Any] = None, |
| ) -> "DataFrame": |
| """ |
| Create a spreadsheet-style pivot table as a DataFrame. The levels in |
| the pivot table will be stored in MultiIndex objects (hierarchical |
| indexes) on the index and columns of the result DataFrame. |
| |
| Parameters |
| ---------- |
values : column to aggregate.
They should be either a list of fewer than three column names or a single
column name (string).
index : column (string) or list of columns
If an array is passed, it must be the same length as the data.
The list should contain strings.
| columns : column |
| Columns used in the pivot operation. Only one column is supported and |
| it should be a string. |
| aggfunc : function (string), dict, default mean |
| If dict is passed, the key is column to aggregate and value |
| is function or list of functions. |
| fill_value : scalar, default None |
| Value to replace missing values with. |
| |
| Returns |
| ------- |
| table : DataFrame |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", |
| ... "bar", "bar", "bar", "bar"], |
| ... "B": ["one", "one", "one", "two", "two", |
| ... "one", "one", "two", "two"], |
| ... "C": ["small", "large", "large", "small", |
| ... "small", "large", "small", "small", |
| ... "large"], |
| ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], |
| ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}, |
| ... columns=['A', 'B', 'C', 'D', 'E']) |
| >>> df |
| A B C D E |
| 0 foo one small 1 2 |
| 1 foo one large 2 4 |
| 2 foo one large 2 5 |
| 3 foo two small 3 5 |
| 4 foo two small 3 6 |
| 5 bar one large 4 6 |
| 6 bar one small 5 8 |
| 7 bar two small 6 9 |
| 8 bar two large 7 9 |
| |
| This first example aggregates values by taking the sum. |
| |
| >>> table = df.pivot_table(values='D', index=['A', 'B'], |
| ... columns='C', aggfunc='sum') |
| >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE |
| C large small |
| A B |
| bar one 4.0 5 |
| two 7.0 6 |
| foo one 4.0 1 |
| two NaN 6 |
| |
| We can also fill missing values using the `fill_value` parameter. |
| |
| >>> table = df.pivot_table(values='D', index=['A', 'B'], |
| ... columns='C', aggfunc='sum', fill_value=0) |
| >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE |
| C large small |
| A B |
| bar one 4 5 |
| two 7 6 |
| foo one 4 1 |
| two 0 6 |
| |
| We can also calculate multiple types of aggregations for any given |
| value column. |
| |
| >>> table = df.pivot_table(values=['D'], index =['C'], |
| ... columns="A", aggfunc={'D': 'mean'}) |
| >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE |
| D |
| A bar foo |
| C |
| large 5.5 2.000000 |
| small 5.5 2.333333 |
| |
| The next example aggregates on multiple values. |
| |
| >>> table = df.pivot_table(index=['C'], columns="A", values=['D', 'E'], |
| ... aggfunc={'D': 'mean', 'E': 'sum'}) |
| >>> table.sort_index() # doctest: +NORMALIZE_WHITESPACE |
| D E |
| A bar foo bar foo |
| C |
| large 5.5 2.000000 15 9 |
| small 5.5 2.333333 17 13 |
| """ |
| if not is_name_like_value(columns): |
| raise TypeError("columns should be one column name.") |
| |
| if not is_name_like_value(values) and not ( |
| isinstance(values, list) and all(is_name_like_value(v) for v in values) |
| ): |
| raise TypeError("values should be one column or list of columns.") |
| |
| if not isinstance(aggfunc, str) and ( |
| not isinstance(aggfunc, dict) |
| or not all( |
| is_name_like_value(key) and isinstance(value, str) for key, value in aggfunc.items() |
| ) |
| ): |
| raise TypeError( |
| "aggfunc must be a dict mapping from column name " |
| "to aggregate functions (string)." |
| ) |
| |
| if isinstance(aggfunc, dict) and index is None: |
raise NotImplementedError(
"pivot_table doesn't support aggfunc as dict when index is None."
)
| if isinstance(values, list) and index is None: |
| raise NotImplementedError("values can't be a list without index.") |
| |
| if columns not in self.columns: |
| raise ValueError("Wrong columns {}.".format(name_like_string(columns))) |
| if not is_name_like_tuple(columns): |
| columns = (columns,) |
| |
| if isinstance(values, list): |
| values = [col if is_name_like_tuple(col) else (col,) for col in values] |
| if not all( |
| isinstance(self._internal.spark_type_for(col), NumericType) for col in values |
| ): |
| raise TypeError("values should be a numeric type.") |
| else: |
| values = values if is_name_like_tuple(values) else (values,) |
| if not isinstance(self._internal.spark_type_for(values), NumericType): |
| raise TypeError("values should be a numeric type.") |
| |
| if isinstance(aggfunc, str): |
| if isinstance(values, list): |
| agg_cols = [ |
| F.expr( |
| "{1}(`{0}`) as `{0}`".format( |
| self._internal.spark_column_name_for(value), aggfunc |
| ) |
| ) |
| for value in values |
| ] |
| else: |
| agg_cols = [ |
| F.expr( |
| "{1}(`{0}`) as `{0}`".format( |
| self._internal.spark_column_name_for(values), aggfunc |
| ) |
| ) |
| ] |
| elif isinstance(aggfunc, dict): |
| aggfunc = { |
| key if is_name_like_tuple(key) else (key,): value for key, value in aggfunc.items() |
| } |
| agg_cols = [ |
| F.expr( |
| "{1}(`{0}`) as `{0}`".format(self._internal.spark_column_name_for(key), value) |
| ) |
| for key, value in aggfunc.items() |
| ] |
| agg_columns = [key for key, _ in aggfunc.items()] |
| |
| if set(agg_columns) != set(values): |
| raise ValueError("Columns in aggfunc must be the same as values.") |
| |
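# Run the pivot on the Spark DataFrame: group by the index columns (if
# any), pivot on the pivot column, and apply the aggregation expressions.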
| sdf = self._internal.resolved_copy.spark_frame |
| if index is None: |
| sdf = ( |
| sdf.groupBy() |
| .pivot(pivot_col=self._internal.spark_column_name_for(columns)) |
| .agg(*agg_cols) |
| ) |
| |
| elif isinstance(index, list): |
| index = [label if is_name_like_tuple(label) else (label,) for label in index] |
| sdf = ( |
| sdf.groupBy([self._internal.spark_column_name_for(label) for label in index]) |
| .pivot(pivot_col=self._internal.spark_column_name_for(columns)) |
| .agg(*agg_cols) |
| ) |
| else: |
| raise TypeError("index should be a None or a list of columns.") |
| |
| if fill_value is not None and isinstance(fill_value, (int, float)): |
| sdf = sdf.fillna(fill_value) |
| |
| if index is not None: |
| index_columns = [self._internal.spark_column_name_for(label) for label in index] |
| index_fields = [self._internal.field_for(label) for label in index] |
| |
| if isinstance(values, list): |
| data_columns = [column for column in sdf.columns if column not in index_columns] |
| |
| if len(values) > 1: |
# When multiple values are aggregated, Spark names each pivoted
# column in the format pivotvalue_valuecolumn, where pivotvalue is
# a value from the pivot column and valuecolumn is one of the names
# passed to `values`. E.g. if the pivot column contains 2 and 3 and
# values is ['b', 'e'], the columns become
# ['2_b', '2_e', '3_b', '3_e'].
| |
| # We sort the columns of Spark DataFrame by values. |
| data_columns.sort(key=lambda x: x.split("_", 1)[1]) |
| sdf = sdf.select(index_columns + data_columns) |
| |
| column_name_to_index = dict( |
| zip(self._internal.data_spark_column_names, self._internal.column_labels) |
| ) |
| column_labels = [ |
| tuple(list(column_name_to_index[name.split("_")[1]]) + [name.split("_")[0]]) |
| for name in data_columns |
| ] |
| column_label_names = ( |
| [cast(Optional[Name], None)] * column_labels_level(values) |
| ) + [columns] |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, col) for col in index_columns], |
| index_names=index, |
| index_fields=index_fields, |
| column_labels=column_labels, |
| data_spark_columns=[scol_for(sdf, col) for col in data_columns], |
| column_label_names=column_label_names, |
| ) |
| psdf = DataFrame(internal) # type: "DataFrame" |
| else: |
| column_labels = [tuple(list(values[0]) + [column]) for column in data_columns] |
| column_label_names = ([cast(Optional[Name], None)] * len(values[0])) + [columns] |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, col) for col in index_columns], |
| index_names=index, |
| index_fields=index_fields, |
| column_labels=column_labels, |
| data_spark_columns=[scol_for(sdf, col) for col in data_columns], |
| column_label_names=column_label_names, |
| ) |
| psdf = DataFrame(internal) |
| else: |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, col) for col in index_columns], |
| index_names=index, |
| index_fields=index_fields, |
| column_label_names=[columns], |
| ) |
| psdf = DataFrame(internal) |
| else: |
| if isinstance(values, list): |
| index_values = values[-1] |
| else: |
| index_values = values |
| index_map = OrderedDict() # type: Dict[str, Optional[Label]] |
| for i, index_value in enumerate(index_values): |
| colname = SPARK_INDEX_NAME_FORMAT(i) |
| sdf = sdf.withColumn(colname, SF.lit(index_value)) |
| index_map[colname] = None |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, col) for col in index_map.keys()], |
| index_names=list(index_map.values()), |
| column_label_names=[columns], |
| ) |
| psdf = DataFrame(internal) |
| |
| psdf_columns = psdf.columns |
| if isinstance(psdf_columns, pd.MultiIndex): |
| psdf.columns = psdf_columns.set_levels( |
| psdf_columns.levels[-1].astype( |
| spark_type_to_pandas_dtype(self._psser_for(columns).spark.data_type) |
| ), |
| level=-1, |
| ) |
| else: |
| psdf.columns = psdf_columns.astype( |
| spark_type_to_pandas_dtype(self._psser_for(columns).spark.data_type) |
| ) |
| |
| return psdf |
| |
| def pivot( |
| self, |
| index: Optional[Name] = None, |
| columns: Optional[Name] = None, |
| values: Optional[Name] = None, |
| ) -> "DataFrame": |
| """ |
| Return reshaped DataFrame organized by given index / column values. |
| |
| Reshape data (produce a "pivot" table) based on column values. Uses |
| unique values from specified `index` / `columns` to form axes of the |
| resulting DataFrame. This function does not support data |
| aggregation. |
| |
| Parameters |
| ---------- |
| index : string, optional |
| Column to use to make new frame's index. If None, uses |
| existing index. |
| columns : string |
| Column to use to make new frame's columns. |
| values : string, object or a list of the previous |
| Column(s) to use for populating new frame's values. |
| |
| Returns |
| ------- |
| DataFrame |
| Returns reshaped DataFrame. |
| |
| See Also |
| -------- |
| DataFrame.pivot_table : Generalization of pivot that can handle |
| duplicate values for one index/column pair. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', |
| ... 'two'], |
| ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], |
| ... 'baz': [1, 2, 3, 4, 5, 6], |
| ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}, |
| ... columns=['foo', 'bar', 'baz', 'zoo']) |
| >>> df |
| foo bar baz zoo |
| 0 one A 1 x |
| 1 one B 2 y |
| 2 one C 3 z |
| 3 two A 4 q |
| 4 two B 5 w |
| 5 two C 6 t |
| |
| >>> df.pivot(index='foo', columns='bar', values='baz').sort_index() |
| ... # doctest: +NORMALIZE_WHITESPACE |
| bar A B C |
| foo |
| one 1 2 3 |
| two 4 5 6 |
| |
| >>> df.pivot(columns='bar', values='baz').sort_index() # doctest: +NORMALIZE_WHITESPACE |
| bar A B C |
| 0 1.0 NaN NaN |
| 1 NaN 2.0 NaN |
| 2 NaN NaN 3.0 |
| 3 4.0 NaN NaN |
| 4 NaN 5.0 NaN |
| 5 NaN NaN 6.0 |
| |
Notice that, unlike pandas, which raises a ValueError when duplicate values
are found, pandas-on-Spark's pivot keeps the first value it meets during the
operation, because pivot is an expensive operation and it is preferred to
execute permissively rather than fail fast when processing large data.
| |
| >>> df = ps.DataFrame({"foo": ['one', 'one', 'two', 'two'], |
| ... "bar": ['A', 'A', 'B', 'C'], |
| ... "baz": [1, 2, 3, 4]}, columns=['foo', 'bar', 'baz']) |
| >>> df |
| foo bar baz |
| 0 one A 1 |
| 1 one A 2 |
| 2 two B 3 |
| 3 two C 4 |
| |
| >>> df.pivot(index='foo', columns='bar', values='baz').sort_index() |
| ... # doctest: +NORMALIZE_WHITESPACE |
| bar A B C |
| foo |
| one 1.0 NaN NaN |
| two NaN 3.0 4.0 |
| |
It also supports multi-index and multi-index columns.

>>> df.columns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'baz')])
| |
| >>> df = df.set_index(('a', 'bar'), append=True) |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| a b |
| foo baz |
| (a, bar) |
| 0 A one 1 |
| 1 A one 2 |
| 2 B two 3 |
| 3 C two 4 |
| |
| >>> df.pivot(columns=('a', 'foo'), values=('b', 'baz')).sort_index() |
| ... # doctest: +NORMALIZE_WHITESPACE |
| ('a', 'foo') one two |
| (a, bar) |
| 0 A 1.0 NaN |
| 1 A 2.0 NaN |
| 2 B NaN 3.0 |
| 3 C NaN 4.0 |
| |
| """ |
| if columns is None: |
| raise ValueError("columns should be set.") |
| |
| if values is None: |
| raise ValueError("values should be set.") |
| |
| should_use_existing_index = index is not None |
| if should_use_existing_index: |
| df = self |
| index_labels = [index] |
| else: |
| # The index after `reset_index()` will never be used, so use "distributed" index |
| # as a dummy to avoid overhead. |
| with option_context("compute.default_index_type", "distributed"): |
| df = self.reset_index() |
| index_labels = df._internal.column_labels[: self._internal.index_level] |
| |
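# pivot without aggregation is emulated via pivot_table with the 'first'
# aggregation, which keeps the first value seen per index/column pair
# (see the note on duplicates in the docstring above).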
| df = df.pivot_table(index=index_labels, columns=columns, values=values, aggfunc="first") |
| |
| if should_use_existing_index: |
| return df |
| else: |
| internal = df._internal.copy(index_names=self._internal.index_names) |
| return DataFrame(internal) |
| |
| @property |
| def columns(self) -> pd.Index: |
| """The column labels of the DataFrame.""" |
| names = [ |
| name if name is None or len(name) > 1 else name[0] |
| for name in self._internal.column_label_names |
| ] |
| if self._internal.column_labels_level > 1: |
| columns = pd.MultiIndex.from_tuples(self._internal.column_labels, names=names) |
| else: |
| columns = pd.Index([label[0] for label in self._internal.column_labels], name=names[0]) |
| return columns |
| |
| @columns.setter |
| def columns(self, columns: Union[pd.Index, List[Name]]) -> None: |
| if isinstance(columns, pd.MultiIndex): |
| column_labels = columns.tolist() |
| else: |
| column_labels = [ |
| col if is_name_like_tuple(col, allow_none=False) else (col,) for col in columns |
| ] |
| |
| if len(self._internal.column_labels) != len(column_labels): |
| raise ValueError( |
| "Length mismatch: Expected axis has {} elements, " |
| "new values have {} elements".format( |
| len(self._internal.column_labels), len(column_labels) |
| ) |
| ) |
| |
| if isinstance(columns, pd.Index): |
| column_label_names = [ |
| name if is_name_like_tuple(name) else (name,) for name in columns.names |
| ] # type: Optional[List] |
| else: |
| column_label_names = None |
| |
| pssers = [ |
| self._psser_for(label).rename(name) |
| for label, name in zip(self._internal.column_labels, column_labels) |
| ] |
| self._update_internal_frame( |
| self._internal.with_new_columns(pssers, column_label_names=column_label_names) |
| ) |
| |
| @property |
| def dtypes(self) -> pd.Series: |
| """Return the dtypes in the DataFrame. |
| |
| This returns a Series with the data type of each column. The result's index is the original |
| DataFrame's columns. Columns with mixed types are stored with the object dtype. |
| |
| Returns |
| ------- |
| pd.Series |
| The data type of each column. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'a': list('abc'), |
| ... 'b': list(range(1, 4)), |
| ... 'c': np.arange(3, 6).astype('i1'), |
| ... 'd': np.arange(4.0, 7.0, dtype='float64'), |
| ... 'e': [True, False, True], |
| ... 'f': pd.date_range('20130101', periods=3)}, |
| ... columns=['a', 'b', 'c', 'd', 'e', 'f']) |
| >>> df.dtypes |
| a object |
| b int64 |
| c int8 |
| d float64 |
| e bool |
| f datetime64[ns] |
| dtype: object |
| """ |
| return pd.Series( |
| [self._psser_for(label).dtype for label in self._internal.column_labels], |
| index=pd.Index( |
| [label if len(label) > 1 else label[0] for label in self._internal.column_labels] |
| ), |
| ) |
| |
| def select_dtypes( |
| self, |
| include: Optional[Union[str, List[str]]] = None, |
| exclude: Optional[Union[str, List[str]]] = None, |
| ) -> "DataFrame": |
| """ |
| Return a subset of the DataFrame's columns based on the column dtypes. |
| |
| Parameters |
| ---------- |
| include, exclude : scalar or list-like |
| A selection of dtypes or strings to be included/excluded. At least |
| one of these parameters must be supplied. It also takes Spark SQL |
| DDL type strings, for instance, 'string' and 'date'. |
| |
| Returns |
| ------- |
| DataFrame |
| The subset of the frame including the dtypes in ``include`` and |
| excluding the dtypes in ``exclude``. |
| |
| Raises |
| ------ |
| ValueError |
| * If both of ``include`` and ``exclude`` are empty |
| |
| >>> df = ps.DataFrame({'a': [1, 2] * 3, |
| ... 'b': [True, False] * 3, |
| ... 'c': [1.0, 2.0] * 3}) |
| >>> df.select_dtypes() |
| Traceback (most recent call last): |
| ... |
| ValueError: at least one of include or exclude must be nonempty |
| |
| * If ``include`` and ``exclude`` have overlapping elements |
| |
| >>> df = ps.DataFrame({'a': [1, 2] * 3, |
| ... 'b': [True, False] * 3, |
| ... 'c': [1.0, 2.0] * 3}) |
| >>> df.select_dtypes(include='a', exclude='a') |
| Traceback (most recent call last): |
| ... |
| ValueError: include and exclude overlap on {'a'} |
| |
| Notes |
| ----- |
| * To select datetimes, use ``np.datetime64``, ``'datetime'`` or |
| ``'datetime64'`` |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'a': [1, 2] * 3, |
| ... 'b': [True, False] * 3, |
| ... 'c': [1.0, 2.0] * 3, |
| ... 'd': ['a', 'b'] * 3}, columns=['a', 'b', 'c', 'd']) |
| >>> df |
| a b c d |
| 0 1 True 1.0 a |
| 1 2 False 2.0 b |
| 2 1 True 1.0 a |
| 3 2 False 2.0 b |
| 4 1 True 1.0 a |
| 5 2 False 2.0 b |
| |
| >>> df.select_dtypes(include='bool') |
| b |
| 0 True |
| 1 False |
| 2 True |
| 3 False |
| 4 True |
| 5 False |
| |
| >>> df.select_dtypes(include=['float64'], exclude=['int']) |
| c |
| 0 1.0 |
| 1 2.0 |
| 2 1.0 |
| 3 2.0 |
| 4 1.0 |
| 5 2.0 |
| |
| >>> df.select_dtypes(exclude=['int']) |
| b c d |
| 0 True 1.0 a |
| 1 False 2.0 b |
| 2 True 1.0 a |
| 3 False 2.0 b |
| 4 True 1.0 a |
| 5 False 2.0 b |
| |
| Spark SQL DDL type strings can be used as well. |
| |
| >>> df.select_dtypes(exclude=['string']) |
| a b c |
| 0 1 True 1.0 |
| 1 2 False 2.0 |
| 2 1 True 1.0 |
| 3 2 False 2.0 |
| 4 1 True 1.0 |
| 5 2 False 2.0 |
| """ |
| from pyspark.sql.types import _parse_datatype_string # type: ignore |
| |
| if not is_list_like(include): |
| include_list = [include] if include is not None else [] |
| else: |
| include_list = list(include) |
| if not is_list_like(exclude): |
| exclude_list = [exclude] if exclude is not None else [] |
| else: |
| exclude_list = list(exclude) |
| |
| if not any((include_list, exclude_list)): |
| raise ValueError("at least one of include or exclude must be " "nonempty") |
| |
| # can't both include AND exclude! |
| if set(include_list).intersection(set(exclude_list)): |
| raise ValueError( |
| "include and exclude overlap on {inc_ex}".format( |
| inc_ex=set(include_list).intersection(set(exclude_list)) |
| ) |
| ) |
| |
| # Handle Spark types |
| include_spark_type = [] |
| for inc in include_list: |
| try: |
| include_spark_type.append(_parse_datatype_string(inc)) |
except Exception:
| pass |
| |
| exclude_spark_type = [] |
| for exc in exclude_list: |
| try: |
| exclude_spark_type.append(_parse_datatype_string(exc)) |
except Exception:
| pass |
| |
| # Handle pandas types |
| include_numpy_type = [] |
| for inc in include_list: |
| try: |
| include_numpy_type.append(infer_dtype_from_object(inc)) |
except Exception:
| pass |
| |
| exclude_numpy_type = [] |
| for exc in exclude_list: |
| try: |
| exclude_numpy_type.append(infer_dtype_from_object(exc)) |
except Exception:
| pass |
| |
| column_labels = [] |
| for label in self._internal.column_labels: |
| if len(include_list) > 0: |
| should_include = ( |
| infer_dtype_from_object(self._psser_for(label).dtype.name) in include_numpy_type |
| or self._internal.spark_type_for(label) in include_spark_type |
| ) |
| else: |
| should_include = not ( |
| infer_dtype_from_object(self._psser_for(label).dtype.name) in exclude_numpy_type |
| or self._internal.spark_type_for(label) in exclude_spark_type |
| ) |
| |
| if should_include: |
| column_labels.append(label) |
| |
| return DataFrame( |
| self._internal.with_new_columns([self._psser_for(label) for label in column_labels]) |
| ) |
| |
| def droplevel( |
| self, level: Union[int, Name, List[Union[int, Name]]], axis: Axis = 0 |
| ) -> "DataFrame": |
| """ |
| Return DataFrame with requested index / column level(s) removed. |
| |
| Parameters |
| ---------- |
level: int, str, or list-like
If a string is given, it must be the name of a level. If list-like, elements
must be names or positional indexes of levels.

axis: {0 or 'index', 1 or 'columns'}, default 0
| |
| Returns |
| ------- |
| DataFrame with requested index / column level(s) removed. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame( |
| ... [[3, 4], [7, 8], [11, 12]], |
| ... index=pd.MultiIndex.from_tuples([(1, 2), (5, 6), (9, 10)], names=["a", "b"]), |
| ... ) |
| |
| >>> df.columns = pd.MultiIndex.from_tuples([ |
| ... ('c', 'e'), ('d', 'f') |
| ... ], names=['level_1', 'level_2']) |
| |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| level_1 c d |
| level_2 e f |
| a b |
| 1 2 3 4 |
| 5 6 7 8 |
| 9 10 11 12 |
| |
| >>> df.droplevel('a') # doctest: +NORMALIZE_WHITESPACE |
| level_1 c d |
| level_2 e f |
| b |
| 2 3 4 |
| 6 7 8 |
| 10 11 12 |
| |
| >>> df.droplevel('level_2', axis=1) # doctest: +NORMALIZE_WHITESPACE |
| level_1 c d |
| a b |
| 1 2 3 4 |
| 5 6 7 8 |
| 9 10 11 12 |
| """ |
| axis = validate_axis(axis) |
| if axis == 0: |
if not isinstance(level, (tuple, list)):  # normalize a single level to a list
| level = [level] |
| |
| names = self.index.names |
| nlevels = self._internal.index_level |
| |
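# Normalize the requested levels to non-negative integer positions,
# validating both level names and positional bounds.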
| int_level = set() |
| for n in level: |
| if isinstance(n, int): |
| if n < 0: |
| n = n + nlevels |
| if n < 0: |
| raise IndexError( |
| "Too many levels: Index has only {} levels, " |
| "{} is not a valid level number".format(nlevels, (n - nlevels)) |
| ) |
| if n >= nlevels: |
| raise IndexError( |
| "Too many levels: Index has only {} levels, not {}".format( |
| nlevels, (n + 1) |
| ) |
| ) |
| else: |
| if n not in names: |
| raise KeyError("Level {} not found".format(n)) |
| n = names.index(n) |
| int_level.add(n) |
| |
| if len(level) >= nlevels: |
| raise ValueError( |
| "Cannot remove {} levels from an index with {} levels: " |
| "at least one level must be left.".format(len(level), nlevels) |
| ) |
| |
| index_spark_columns, index_names, index_fields = zip( |
| *[ |
| item |
| for i, item in enumerate( |
| zip( |
| self._internal.index_spark_columns, |
| self._internal.index_names, |
| self._internal.index_fields, |
| ) |
| ) |
| if i not in int_level |
| ] |
| ) |
| |
| internal = self._internal.copy( |
| index_spark_columns=list(index_spark_columns), |
| index_names=list(index_names), |
| index_fields=list(index_fields), |
| ) |
| return DataFrame(internal) |
| else: |
| psdf = self.copy() |
| psdf.columns = psdf.columns.droplevel(level) |
| return psdf |
| |
| def drop( |
| self, |
| labels: Optional[Union[Name, List[Name]]] = None, |
| axis: Axis = 1, |
| columns: Union[Name, List[Name]] = None, |
| ) -> "DataFrame": |
| """ |
| Drop specified labels from columns. |
| |
| Remove columns by specifying label names and axis=1 or columns. |
| When specifying both labels and columns, only labels will be dropped. |
| Removing rows is yet to be implemented. |
| |
| Parameters |
| ---------- |
| labels : single label or list-like |
| Column labels to drop. |
| axis : {1 or 'columns'}, default 1 |
.. drop currently only works for axis=1 'columns';
axis=0 is yet to be implemented.
| columns : single label or list-like |
| Alternative to specifying axis (``labels, axis=1`` |
| is equivalent to ``columns=labels``). |
| |
| Returns |
| ------- |
| dropped : DataFrame |
| |
| See Also |
| -------- |
| Series.dropna |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]}, |
| ... columns=['x', 'y', 'z', 'w']) |
| >>> df |
| x y z w |
| 0 1 3 5 7 |
| 1 2 4 6 8 |
| |
| >>> df.drop('x', axis=1) |
| y z w |
| 0 3 5 7 |
| 1 4 6 8 |
| |
| >>> df.drop(['y', 'z'], axis=1) |
| x w |
| 0 1 7 |
| 1 2 8 |
| |
| >>> df.drop(columns=['y', 'z']) |
| x w |
| 0 1 7 |
| 1 2 8 |
| |
MultiIndex columns are also supported
| |
| >>> df = ps.DataFrame({'x': [1, 2], 'y': [3, 4], 'z': [5, 6], 'w': [7, 8]}, |
| ... columns=['x', 'y', 'z', 'w']) |
| >>> columns = [('a', 'x'), ('a', 'y'), ('b', 'z'), ('b', 'w')] |
| >>> df.columns = pd.MultiIndex.from_tuples(columns) |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| a b |
| x y z w |
| 0 1 3 5 7 |
| 1 2 4 6 8 |
| >>> df.drop('a') # doctest: +NORMALIZE_WHITESPACE |
| b |
| z w |
| 0 5 7 |
| 1 6 8 |
| |
| Notes |
| ----- |
| Currently only axis = 1 is supported in this function, |
| axis = 0 is yet to be implemented. |
| """ |
| if labels is not None: |
| axis = validate_axis(axis) |
| if axis == 1: |
| return self.drop(columns=labels) |
| raise NotImplementedError("Drop currently only works for axis=1") |
| elif columns is not None: |
| if is_name_like_tuple(columns): |
| columns = [columns] |
| elif is_name_like_value(columns): |
| columns = [(columns,)] |
| else: |
| columns = [col if is_name_like_tuple(col) else (col,) for col in columns] |
| drop_column_labels = set( |
| label |
| for label in self._internal.column_labels |
| for col in columns |
| if label[: len(col)] == col |
| ) |
| if len(drop_column_labels) == 0: |
| raise KeyError(columns) |
| cols, labels = zip( |
| *( |
| (column, label) |
| for column, label in zip( |
| self._internal.data_spark_column_names, self._internal.column_labels |
| ) |
| if label not in drop_column_labels |
| ) |
| ) |
| internal = self._internal.with_new_columns([self._psser_for(label) for label in labels]) |
| return DataFrame(internal) |
| else: |
| raise ValueError("Need to specify at least one of 'labels' or 'columns'") |
| |
| def _sort( |
| self, by: List[Column], ascending: Union[bool, List[bool]], na_position: str |
| ) -> "DataFrame": |
| if isinstance(ascending, bool): |
| ascending = [ascending] * len(by) |
| if len(ascending) != len(by): |
| raise ValueError( |
| "Length of ascending ({}) != length of by ({})".format(len(ascending), len(by)) |
| ) |
| if na_position not in ("first", "last"): |
| raise ValueError("invalid na_position: '{}'".format(na_position)) |
| |
| # Mapper: Get a spark column function for (ascending, na_position) combination |
| # Note that 'asc_nulls_first' and friends were added as of Spark 2.4, see SPARK-23847. |
| mapper = { |
| (True, "first"): lambda x: Column(getattr(x._jc, "asc_nulls_first")()), |
| (True, "last"): lambda x: Column(getattr(x._jc, "asc_nulls_last")()), |
| (False, "first"): lambda x: Column(getattr(x._jc, "desc_nulls_first")()), |
| (False, "last"): lambda x: Column(getattr(x._jc, "desc_nulls_last")()), |
| } |
| by = [mapper[(asc, na_position)](scol) for scol, asc in zip(by, ascending)] |
| sdf = self._internal.resolved_copy.spark_frame.sort(*by, NATURAL_ORDER_COLUMN_NAME) |
| return DataFrame(self._internal.with_new_sdf(sdf)) |
| |
| def sort_values( |
| self, |
| by: Union[Name, List[Name]], |
| ascending: Union[bool, List[bool]] = True, |
| inplace: bool = False, |
| na_position: str = "last", |
| ) -> Optional["DataFrame"]: |
| """ |
| Sort by the values along either axis. |
| |
| Parameters |
| ---------- |
| by : str or list of str |
| ascending : bool or list of bool, default True |
| Sort ascending vs. descending. Specify list for multiple sort |
| orders. If this is a list of bools, must match the length of |
| the by. |
| inplace : bool, default False |
| if True, perform operation in-place |
| na_position : {'first', 'last'}, default 'last' |
| `first` puts NaNs at the beginning, `last` puts NaNs at the end |
| |
| Returns |
| ------- |
| sorted_obj : DataFrame |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({ |
| ... 'col1': ['A', 'B', None, 'D', 'C'], |
| ... 'col2': [2, 9, 8, 7, 4], |
| ... 'col3': [0, 9, 4, 2, 3], |
| ... }, |
| ... columns=['col1', 'col2', 'col3']) |
| >>> df |
| col1 col2 col3 |
| 0 A 2 0 |
| 1 B 9 9 |
| 2 None 8 4 |
| 3 D 7 2 |
| 4 C 4 3 |
| |
| Sort by col1 |
| |
| >>> df.sort_values(by=['col1']) |
| col1 col2 col3 |
| 0 A 2 0 |
| 1 B 9 9 |
| 4 C 4 3 |
| 3 D 7 2 |
| 2 None 8 4 |
| |
| Sort Descending |
| |
| >>> df.sort_values(by='col1', ascending=False) |
| col1 col2 col3 |
| 3 D 7 2 |
| 4 C 4 3 |
| 1 B 9 9 |
| 0 A 2 0 |
| 2 None 8 4 |
| |
| Sort by multiple columns |
| |
| >>> df = ps.DataFrame({ |
| ... 'col1': ['A', 'A', 'B', None, 'D', 'C'], |
| ... 'col2': [2, 1, 9, 8, 7, 4], |
| ... 'col3': [0, 1, 9, 4, 2, 3], |
| ... }, |
| ... columns=['col1', 'col2', 'col3']) |
| >>> df.sort_values(by=['col1', 'col2']) |
| col1 col2 col3 |
| 1 A 1 1 |
| 0 A 2 0 |
| 2 B 9 9 |
| 5 C 4 3 |
| 4 D 7 2 |
| 3 None 8 4 |
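
Use `na_position` to sort missing values to the front (a sketch; skipped in
doctests):

>>> df.sort_values(by=['col1'], na_position='first')  # doctest: +SKIP
col1 col2 col3
3 None 8 4
1 A 1 1
0 A 2 0
2 B 9 9
5 C 4 3
4 D 7 2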
| """ |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| if is_name_like_value(by): |
| by = [by] |
| else: |
| assert is_list_like(by), type(by) |
| |
| new_by = [] |
| for colname in by: |
| ser = self[colname] |
| if not isinstance(ser, ps.Series): |
| raise ValueError( |
| "The column %s is not unique. For a multi-index, the label must be a tuple " |
| "with elements corresponding to each level." % name_like_string(colname) |
| ) |
| new_by.append(ser.spark.column) |
| |
| psdf = self._sort(by=new_by, ascending=ascending, na_position=na_position) |
| if inplace: |
| self._update_internal_frame(psdf._internal) |
| return None |
| else: |
| return psdf |
| |
| def sort_index( |
| self, |
| axis: Axis = 0, |
| level: Optional[Union[int, List[int]]] = None, |
| ascending: bool = True, |
| inplace: bool = False, |
| kind: str = None, |
| na_position: str = "last", |
| ) -> Optional["DataFrame"]: |
| """ |
| Sort object by labels (along an axis) |
| |
| Parameters |
| ---------- |
| axis : index, columns to direct sorting. Currently, only axis = 0 is supported. |
| level : int or level name or list of ints or list of level names |
| if not None, sort on values in specified index level(s) |
| ascending : boolean, default True |
| Sort ascending vs. descending |
| inplace : bool, default False |
| if True, perform operation in-place |
| kind : str, default None |
| pandas-on-Spark does not allow specifying the sorting algorithm at the moment, |
| default None |
na_position : {'first', 'last'}, default 'last'
| first puts NaNs at the beginning, last puts NaNs at the end. Not implemented for |
| MultiIndex. |
| |
| Returns |
| ------- |
| sorted_obj : DataFrame |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': [2, 1, np.nan]}, index=['b', 'a', np.nan]) |
| |
| >>> df.sort_index() |
| A |
| a 1.0 |
| b 2.0 |
| NaN NaN |
| |
| >>> df.sort_index(ascending=False) |
| A |
| b 2.0 |
| a 1.0 |
| NaN NaN |
| |
| >>> df.sort_index(na_position='first') |
| A |
| NaN NaN |
| a 1.0 |
| b 2.0 |
| |
| >>> df.sort_index(inplace=True) |
| >>> df |
| A |
| a 1.0 |
| b 2.0 |
| NaN NaN |
| |
| >>> df = ps.DataFrame({'A': range(4), 'B': range(4)[::-1]}, |
| ... index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]], |
| ... columns=['A', 'B']) |
| |
| >>> df.sort_index() |
| A B |
| a 0 3 0 |
| 1 2 1 |
| b 0 1 2 |
| 1 0 3 |
| |
| >>> df.sort_index(level=1) # doctest: +SKIP |
| A B |
| a 0 3 0 |
| b 0 1 2 |
| a 1 2 1 |
| b 1 0 3 |
| |
| >>> df.sort_index(level=[1, 0]) |
| A B |
| a 0 3 0 |
| b 0 1 2 |
| a 1 2 1 |
| b 1 0 3 |
| """ |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError("No other axis than 0 are supported at the moment") |
| if kind is not None: |
| raise NotImplementedError( |
| "Specifying the sorting algorithm is not supported at the moment." |
| ) |
| |
| if level is None or (is_list_like(level) and len(level) == 0): # type: ignore |
| by = self._internal.index_spark_columns |
| elif is_list_like(level): |
| by = [self._internal.index_spark_columns[l] for l in level] # type: ignore |
| else: |
| by = [self._internal.index_spark_columns[level]] # type: ignore |
| |
| psdf = self._sort(by=by, ascending=ascending, na_position=na_position) |
| if inplace: |
| self._update_internal_frame(psdf._internal) |
| return None |
| else: |
| return psdf |
| |
| def swaplevel( |
| self, i: Union[int, Name] = -2, j: Union[int, Name] = -1, axis: Axis = 0 |
| ) -> "DataFrame": |
| """ |
| Swap levels i and j in a MultiIndex on a particular axis. |
| |
| Parameters |
| ---------- |
| i, j : int or str |
| Levels of the indices to be swapped. Can pass level name as string. |
| axis : {0 or 'index', 1 or 'columns'}, default 0 |
| The axis to swap levels on. 0 or 'index' for row-wise, 1 or |
| 'columns' for column-wise. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame with levels swapped in MultiIndex. |
| |
| Examples |
| -------- |
| >>> midx = pd.MultiIndex.from_arrays( |
| ... [['red', 'blue'], [1, 2], ['s', 'm']], names = ['color', 'number', 'size']) |
| >>> midx # doctest: +SKIP |
| MultiIndex([( 'red', 1, 's'), |
| ('blue', 2, 'm')], |
| names=['color', 'number', 'size']) |
| |
| Swap levels in a MultiIndex on index. |
| |
| >>> psdf = ps.DataFrame({'x': [5, 6], 'y':[5, 6]}, index=midx) |
| >>> psdf # doctest: +NORMALIZE_WHITESPACE |
| x y |
| color number size |
| red 1 s 5 5 |
| blue 2 m 6 6 |
| |
| >>> psdf.swaplevel() # doctest: +NORMALIZE_WHITESPACE |
| x y |
| color size number |
| red s 1 5 5 |
| blue m 2 6 6 |
| |
| >>> psdf.swaplevel(0, 1) # doctest: +NORMALIZE_WHITESPACE |
| x y |
| number color size |
| 1 red s 5 5 |
| 2 blue m 6 6 |
| |
| >>> psdf.swaplevel('number', 'size') # doctest: +NORMALIZE_WHITESPACE |
| x y |
| color size number |
| red s 1 5 5 |
| blue m 2 6 6 |
| |
| Swap levels in a MultiIndex on columns. |
| |
| >>> psdf = ps.DataFrame({'x': [5, 6], 'y':[5, 6]}) |
| >>> psdf.columns = midx |
| >>> psdf |
| color red blue |
| number 1 2 |
| size s m |
| 0 5 5 |
| 1 6 6 |
| |
| >>> psdf.swaplevel(axis=1) |
| color red blue |
| size s m |
| number 1 2 |
| 0 5 5 |
| 1 6 6 |
| |
| >>> psdf.swaplevel(0, 1, axis=1) |
| number 1 2 |
| color red blue |
| size s m |
| 0 5 5 |
| 1 6 6 |
| |
| >>> psdf.swaplevel('number', 'color', axis=1) |
| number 1 2 |
| color red blue |
| size s m |
| 0 5 5 |
| 1 6 6 |
| """ |
| axis = validate_axis(axis) |
| if axis == 0: |
| internal = self._swaplevel_index(i, j) |
| else: |
| assert axis == 1 |
| internal = self._swaplevel_columns(i, j) |
| |
| return DataFrame(internal) |
| |
| def swapaxes(self, i: Axis, j: Axis, copy: bool = True) -> "DataFrame": |
| """ |
| Interchange axes and swap values axes appropriately. |
| |
| .. note:: This method is based on an expensive operation due to the nature |
| of big data. Internally it needs to generate each row for each value, and |
| then group twice - it is a huge operation. To prevent misusage, this method |
| has the 'compute.max_rows' default limit of input length, and raises a ValueError. |
| |
| >>> from pyspark.pandas.config import option_context |
| >>> with option_context('compute.max_rows', 1000): # doctest: +NORMALIZE_WHITESPACE |
| ... ps.DataFrame({'a': range(1001)}).swapaxes(i=0, j=1) |
| Traceback (most recent call last): |
| ... |
ValueError: Current DataFrame has more than the given limit 1000 rows.
Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option'
to retrieve more than 1000 rows. Note that, before changing the
'compute.max_rows', this operation is considerably expensive.
| |
| Parameters |
| ---------- |
| i: {0 or 'index', 1 or 'columns'}. The axis to swap. |
| j: {0 or 'index', 1 or 'columns'}. The axis to swap. |
| copy : bool, default True. |
| |
| Returns |
| ------- |
| DataFrame |
| |
| Examples |
| -------- |
| >>> psdf = ps.DataFrame( |
| ... [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['x', 'y', 'z'], columns=['a', 'b', 'c'] |
| ... ) |
| >>> psdf |
| a b c |
| x 1 2 3 |
| y 4 5 6 |
| z 7 8 9 |
| >>> psdf.swapaxes(i=1, j=0) |
| x y z |
| a 1 4 7 |
| b 2 5 8 |
| c 3 6 9 |
| >>> psdf.swapaxes(i=1, j=1) |
| a b c |
| x 1 2 3 |
| y 4 5 6 |
| z 7 8 9 |
| """ |
| assert copy is True |
| |
| i = validate_axis(i) |
| j = validate_axis(j) |
| |
| return self.copy() if i == j else self.transpose() |
| |
| def _swaplevel_columns(self, i: Union[int, Name], j: Union[int, Name]) -> InternalFrame: |
| assert isinstance(self.columns, pd.MultiIndex) |
| for index in (i, j): |
| if not isinstance(index, int) and index not in self.columns.names: |
| raise KeyError("Level %s not found" % index) |
| |
| i = i if isinstance(i, int) else self.columns.names.index(i) |
| j = j if isinstance(j, int) else self.columns.names.index(j) |
for index in (i, j):
if index >= self._internal.column_labels_level or index < -self._internal.column_labels_level:
raise IndexError(
"Too many levels: Columns have only %s levels, "
"%s is not a valid level number" % (self._internal.column_labels_level, index)
)
| |
| column_label_names = self._internal.column_label_names.copy() |
column_label_names[i], column_label_names[j] = (
column_label_names[j],
column_label_names[i],
)
| column_labels = self._internal._column_labels |
| column_label_list = [list(label) for label in column_labels] |
| for label_list in column_label_list: |
| label_list[i], label_list[j] = label_list[j], label_list[i] |
| column_labels = [tuple(x) for x in column_label_list] |
| internal = self._internal.copy( |
| column_label_names=list(column_label_names), column_labels=list(column_labels) |
| ) |
| return internal |
| |
| def _swaplevel_index(self, i: Union[int, Name], j: Union[int, Name]) -> InternalFrame: |
| assert isinstance(self.index, ps.MultiIndex) |
| for index in (i, j): |
| if not isinstance(index, int) and index not in self.index.names: |
| raise KeyError("Level %s not found" % index) |
| |
| i = i if isinstance(i, int) else self.index.names.index(i) |
| j = j if isinstance(j, int) else self.index.names.index(j) |
| for index in (i, j): |
| if index >= self._internal.index_level or index < -self._internal.index_level: |
| raise IndexError( |
| "Too many levels: Index has only %s levels, " |
| "%s is not a valid level number" % (self._internal.index_level, index) |
| ) |
| |
| index_map = list( |
| zip( |
| self._internal.index_spark_columns, |
| self._internal.index_names, |
| self._internal.index_fields, |
| ) |
| ) |
| index_map[i], index_map[j] = index_map[j], index_map[i] |
| index_spark_columns, index_names, index_fields = zip(*index_map) |
| internal = self._internal.copy( |
| index_spark_columns=list(index_spark_columns), |
| index_names=list(index_names), |
| index_fields=list(index_fields), |
| ) |
| return internal |
| |
| # TODO: add keep = First |
| def nlargest(self, n: int, columns: Union[Name, List[Name]]) -> "DataFrame": |
| """ |
| Return the first `n` rows ordered by `columns` in descending order. |
| |
| Return the first `n` rows with the largest values in `columns`, in |
| descending order. The columns that are not specified are returned as |
| well, but not used for ordering. |
| |
| This method is equivalent to |
| ``df.sort_values(columns, ascending=False).head(n)``, but more |
| performant in pandas. |
| In pandas-on-Spark, thanks to Spark's lazy execution and query optimizer, |
the two have the same performance.
| |
| Parameters |
| ---------- |
| n : int |
| Number of rows to return. |
| columns : label or list of labels |
| Column label(s) to order by. |
| |
| Returns |
| ------- |
| DataFrame |
| The first `n` rows ordered by the given columns in descending |
| order. |
| |
| See Also |
| -------- |
| DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in |
| ascending order. |
| DataFrame.sort_values : Sort DataFrame by the values. |
| DataFrame.head : Return the first `n` rows without re-ordering. |
| |
| Notes |
| ----- |
| |
| This function cannot be used with all column types. For example, when |
| specifying columns with `object` or `category` dtypes, ``TypeError`` is |
| raised. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'X': [1, 2, 3, 5, 6, 7, np.nan], |
| ... 'Y': [6, 7, 8, 9, 10, 11, 12]}) |
| >>> df |
| X Y |
| 0 1.0 6 |
| 1 2.0 7 |
| 2 3.0 8 |
| 3 5.0 9 |
| 4 6.0 10 |
| 5 7.0 11 |
| 6 NaN 12 |
| |
In the following example, we will use ``nlargest`` to select the three
rows having the largest values in column "X".
| |
| >>> df.nlargest(n=3, columns='X') |
| X Y |
| 5 7.0 11 |
| 4 6.0 10 |
| 3 5.0 9 |
| |
| >>> df.nlargest(n=3, columns=['Y', 'X']) |
| X Y |
| 6 NaN 12 |
| 5 7.0 11 |
| 4 6.0 10 |
| |
| """ |
| return self.sort_values(by=columns, ascending=False).head(n=n) |
| |
| # TODO: add keep = First |
| def nsmallest(self, n: int, columns: Union[Name, List[Name]]) -> "DataFrame": |
| """ |
| Return the first `n` rows ordered by `columns` in ascending order. |
| |
| Return the first `n` rows with the smallest values in `columns`, in |
| ascending order. The columns that are not specified are returned as |
| well, but not used for ordering. |
| |
| This method is equivalent to ``df.sort_values(columns, ascending=True).head(n)``, |
| but more performant. In pandas-on-Spark, thanks to Spark's lazy execution and query |
optimizer, the two have the same performance.
| |
| Parameters |
| ---------- |
| n : int |
| Number of items to retrieve. |
| columns : list or str |
| Column name or names to order by. |
| |
| Returns |
| ------- |
| DataFrame |
| |
| See Also |
| -------- |
| DataFrame.nlargest : Return the first `n` rows ordered by `columns` in |
| descending order. |
| DataFrame.sort_values : Sort DataFrame by the values. |
| DataFrame.head : Return the first `n` rows without re-ordering. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'X': [1, 2, 3, 5, 6, 7, np.nan], |
| ... 'Y': [6, 7, 8, 9, 10, 11, 12]}) |
| >>> df |
| X Y |
| 0 1.0 6 |
| 1 2.0 7 |
| 2 3.0 8 |
| 3 5.0 9 |
| 4 6.0 10 |
| 5 7.0 11 |
| 6 NaN 12 |
| |
In the following example, we will use ``nsmallest`` to select the
three rows having the smallest values in column "X".
| |
| >>> df.nsmallest(n=3, columns='X') # doctest: +NORMALIZE_WHITESPACE |
| X Y |
| 0 1.0 6 |
| 1 2.0 7 |
| 2 3.0 8 |
| |
To order by the smallest values in column "Y" and then "X", we can
specify multiple columns as in the next example.
| |
| >>> df.nsmallest(n=3, columns=['Y', 'X']) # doctest: +NORMALIZE_WHITESPACE |
| X Y |
| 0 1.0 6 |
| 1 2.0 7 |
| 2 3.0 8 |
| """ |
| return self.sort_values(by=columns, ascending=True).head(n=n) |
| |
| def isin(self, values: Union[List, Dict]) -> "DataFrame": |
| """ |
| Whether each element in the DataFrame is contained in values. |
| |
| Parameters |
| ---------- |
| values : iterable or dict |
| The sequence of values to test. If values is a dict, |
| the keys must be the column names, which must match. |
| Series and DataFrame are not supported. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame of booleans showing whether each element in the DataFrame |
| is contained in values. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, |
| ... index=['falcon', 'dog'], |
| ... columns=['num_legs', 'num_wings']) |
| >>> df |
| num_legs num_wings |
| falcon 2 2 |
| dog 4 0 |
| |
When ``values`` is a list, check whether every value in the DataFrame
is present in the list (which animals have 0 or 2 legs or wings)
| |
| >>> df.isin([0, 2]) |
| num_legs num_wings |
| falcon True True |
| dog False True |
| |
| When ``values`` is a dict, we can pass values to check for each |
| column separately: |
| |
| >>> df.isin({'num_wings': [0, 3]}) |
| num_legs num_wings |
| falcon False False |
| dog False True |
| """ |
| if isinstance(values, (pd.DataFrame, pd.Series)): |
| raise NotImplementedError("DataFrame and Series are not supported") |
| if isinstance(values, dict) and not set(values.keys()).issubset(self.columns): |
| raise AttributeError( |
| "'DataFrame' object has no attribute %s" |
| % (set(values.keys()).difference(self.columns)) |
| ) |
| |
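# Build one boolean Spark column per original column; with a dict of
# values, columns that have no candidate values become all-False.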
| data_spark_columns = [] |
| if isinstance(values, dict): |
| for i, col in enumerate(self.columns): |
| if col in values: |
| item = values[col] |
| item = item.tolist() if isinstance(item, np.ndarray) else list(item) |
| data_spark_columns.append( |
| self._internal.spark_column_for(self._internal.column_labels[i]) |
| .isin(item) |
| .alias(self._internal.data_spark_column_names[i]) |
| ) |
| else: |
| data_spark_columns.append( |
| SF.lit(False).alias(self._internal.data_spark_column_names[i]) |
| ) |
| elif is_list_like(values): |
| values = ( |
| cast(np.ndarray, values).tolist() |
| if isinstance(values, np.ndarray) |
| else list(values) |
| ) |
| data_spark_columns += [ |
| self._internal.spark_column_for(label) |
| .isin(values) |
| .alias(self._internal.spark_column_name_for(label)) |
| for label in self._internal.column_labels |
| ] |
| else: |
| raise TypeError("Values should be iterable, Series, DataFrame or dict.") |
| |
| return DataFrame(self._internal.with_new_columns(data_spark_columns)) |
| |
| @property |
| def shape(self) -> Tuple[int, int]: |
| """ |
| Return a tuple representing the dimensionality of the DataFrame. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) |
| >>> df.shape |
| (2, 2) |
| |
| >>> df = ps.DataFrame({'col1': [1, 2], 'col2': [3, 4], |
| ... 'col3': [5, 6]}) |
| >>> df.shape |
| (2, 3) |
| """ |
| return len(self), len(self.columns) |
| |
| def merge( |
| self, |
| right: "DataFrame", |
| how: str = "inner", |
| on: Optional[Union[Name, List[Name]]] = None, |
| left_on: Optional[Union[Name, List[Name]]] = None, |
| right_on: Optional[Union[Name, List[Name]]] = None, |
| left_index: bool = False, |
| right_index: bool = False, |
| suffixes: Tuple[str, str] = ("_x", "_y"), |
| ) -> "DataFrame": |
| """ |
| Merge DataFrame objects with a database-style join. |
| |
| The index of the resulting DataFrame will be one of the following: |
| - 0...n if no index is used for merging |
| - Index of the left DataFrame if merged only on the index of the right DataFrame |
| - Index of the right DataFrame if merged only on the index of the left DataFrame |
| - All involved indices if merged using the indices of both DataFrames |
| e.g. if `left` with indices (a, x) and `right` with indices (b, x), the result will |
| be an index (x, a, b) |
| |
| Parameters |
| ---------- |
| right: Object to merge with. |
| how: Type of merge to be performed. |
| {'left', 'right', 'outer', 'inner'}, default 'inner' |
| |
left: use only keys from the left frame, similar to a SQL left outer join; does
not preserve key order, unlike pandas.
right: use only keys from the right frame, similar to a SQL right outer join; does
not preserve key order, unlike pandas.
outer: use the union of keys from both frames, similar to a SQL full outer join;
sorts keys lexicographically.
inner: use the intersection of keys from both frames, similar to a SQL inner join;
does not preserve the order of the left keys, unlike pandas.
| on: Column or index level names to join on. These must be found in both DataFrames. If on |
| is None and not merging on indexes then this defaults to the intersection of the |
| columns in both DataFrames. |
| left_on: Column or index level names to join on in the left DataFrame. Can also |
| be an array or list of arrays of the length of the left DataFrame. |
| These arrays are treated as if they are columns. |
| right_on: Column or index level names to join on in the right DataFrame. Can also |
| be an array or list of arrays of the length of the right DataFrame. |
| These arrays are treated as if they are columns. |
| left_index: Use the index from the left DataFrame as the join key(s). If it is a |
| MultiIndex, the number of keys in the other DataFrame (either the index or a number of |
| columns) must match the number of levels. |
| right_index: Use the index from the right DataFrame as the join key. Same caveats as |
| left_index. |
| suffixes: Suffix to apply to overlapping column names in the left and right side, |
| respectively. |
| |
| Returns |
| ------- |
| DataFrame |
| A DataFrame of the two merged objects. |
| |
| See Also |
| -------- |
| DataFrame.join : Join columns of another DataFrame. |
| DataFrame.update : Modify in place using non-NA values from another DataFrame. |
| DataFrame.hint : Specifies some hint on the current DataFrame. |
| broadcast : Marks a DataFrame as small enough for use in broadcast joins. |
| |
| Examples |
| -------- |
| >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], |
| ... 'value': [1, 2, 3, 5]}, |
| ... columns=['lkey', 'value']) |
| >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], |
| ... 'value': [5, 6, 7, 8]}, |
| ... columns=['rkey', 'value']) |
| >>> df1 |
| lkey value |
| 0 foo 1 |
| 1 bar 2 |
| 2 baz 3 |
| 3 foo 5 |
| >>> df2 |
| rkey value |
| 0 foo 5 |
| 1 bar 6 |
| 2 baz 7 |
| 3 foo 8 |
| |
| Merge df1 and df2 on the lkey and rkey columns. The value columns have |
| the default suffixes, _x and _y, appended. |
| |
| >>> merged = df1.merge(df2, left_on='lkey', right_on='rkey') |
| >>> merged.sort_values(by=['lkey', 'value_x', 'rkey', 'value_y']) # doctest: +ELLIPSIS |
| lkey value_x rkey value_y |
| ...bar 2 bar 6 |
| ...baz 3 baz 7 |
| ...foo 1 foo 5 |
| ...foo 1 foo 8 |
| ...foo 5 foo 5 |
| ...foo 5 foo 8 |
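
Custom suffixes can be supplied for the overlapping column names (a sketch;
skipped in doctests):

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right')).columns  # doctest: +SKIP
Index(['lkey', 'value_left', 'rkey', 'value_right'], dtype='object')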
| |
| >>> left_psdf = ps.DataFrame({'A': [1, 2]}) |
| >>> right_psdf = ps.DataFrame({'B': ['x', 'y']}, index=[1, 2]) |
| |
| >>> left_psdf.merge(right_psdf, left_index=True, right_index=True).sort_index() |
| A B |
| 1 2 x |
| |
| >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='left').sort_index() |
| A B |
| 0 1 None |
| 1 2 x |
| |
| >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='right').sort_index() |
| A B |
| 1 2.0 x |
| 2 NaN y |
| |
| >>> left_psdf.merge(right_psdf, left_index=True, right_index=True, how='outer').sort_index() |
| A B |
| 0 1.0 None |
| 1 2.0 x |
| 2 NaN y |
| |
| Notes |
| ----- |
| As described in #263, joining string columns currently returns None for missing values |
| instead of NaN. |
| """ |
| |
| def to_list(os: Optional[Union[Name, List[Name]]]) -> List[Label]: |
| if os is None: |
| return [] |
| elif is_name_like_tuple(os): |
| return [os] # type: ignore |
| elif is_name_like_value(os): |
| return [(os,)] |
| else: |
| return [o if is_name_like_tuple(o) else (o,) for o in os] |
| |
| if isinstance(right, ps.Series): |
| right = right.to_frame() |
| |
| if on: |
| if left_on or right_on: |
| raise ValueError( |
| 'Can only pass argument "on" OR "left_on" and "right_on", ' |
| "not a combination of both." |
| ) |
| left_key_names = list(map(self._internal.spark_column_name_for, to_list(on))) |
| right_key_names = list(map(right._internal.spark_column_name_for, to_list(on))) |
| else: |
| # TODO: need special handling for multi-index. |
| if left_index: |
| left_key_names = self._internal.index_spark_column_names |
| else: |
| left_key_names = list(map(self._internal.spark_column_name_for, to_list(left_on))) |
| if right_index: |
| right_key_names = right._internal.index_spark_column_names |
| else: |
| right_key_names = list( |
| map(right._internal.spark_column_name_for, to_list(right_on)) |
| ) |
| |
| if left_key_names and not right_key_names: |
| raise ValueError("Must pass right_on or right_index=True") |
| if right_key_names and not left_key_names: |
| raise ValueError("Must pass left_on or left_index=True") |
| if not left_key_names and not right_key_names: |
| common = list(self.columns.intersection(right.columns)) |
| if len(common) == 0: |
| raise ValueError( |
| "No common columns to perform merge on. Merge options: " |
| "left_on=None, right_on=None, left_index=False, right_index=False" |
| ) |
| left_key_names = list(map(self._internal.spark_column_name_for, to_list(common))) |
| right_key_names = list(map(right._internal.spark_column_name_for, to_list(common))) |
| if len(left_key_names) != len(right_key_names): |
| raise ValueError("len(left_keys) must equal len(right_keys)") |
| |
| # We should distinguish the name to avoid ambiguous column name after merging. |
| right_prefix = "__right_" |
| right_key_names = [right_prefix + right_key_name for right_key_name in right_key_names] |
| |
| how = validate_how(how) |
| |
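| # Rename every non-hidden column with a side-specific prefix (e.g. "__right_") so the
| # two Spark frames can be joined without ambiguous column references.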
| def resolve(internal: InternalFrame, side: str) -> InternalFrame: |
| rename = lambda col: "__{}_{}".format(side, col) |
| internal = internal.resolved_copy |
| sdf = internal.spark_frame |
| sdf = sdf.select( |
| *[ |
| scol_for(sdf, col).alias(rename(col)) |
| for col in sdf.columns |
| if col not in HIDDEN_COLUMNS |
| ], |
| *HIDDEN_COLUMNS, |
| ) |
| return internal.copy( |
| spark_frame=sdf, |
| index_spark_columns=[ |
| scol_for(sdf, rename(col)) for col in internal.index_spark_column_names |
| ], |
| index_fields=[ |
| field.copy(name=rename(field.name)) for field in internal.index_fields |
| ], |
| data_spark_columns=[ |
| scol_for(sdf, rename(col)) for col in internal.data_spark_column_names |
| ], |
| data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields], |
| ) |
| |
| left_internal = self._internal.resolved_copy |
| right_internal = resolve(right._internal, "right") |
| |
| left_table = left_internal.spark_frame.alias("left_table") |
| right_table = right_internal.spark_frame.alias("right_table") |
| |
| left_key_columns = [scol_for(left_table, label) for label in left_key_names] |
| right_key_columns = [scol_for(right_table, label) for label in right_key_names] |
| |
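| # Build an equi-join condition; reduce combines the per-key equality predicates with AND.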
| join_condition = reduce( |
| lambda x, y: x & y, |
| [lkey == rkey for lkey, rkey in zip(left_key_columns, right_key_columns)], |
| ) |
| |
| joined_table = left_table.join(right_table, join_condition, how=how) |
| |
| # Unpack suffixes tuple for convenience |
| left_suffix = suffixes[0] |
| right_suffix = suffixes[1] |
| |
| # Append suffixes to columns with the same name to avoid conflicts later |
| duplicate_columns = set(left_internal.column_labels) & set(right_internal.column_labels) |
| |
| exprs = [] |
| data_columns = [] |
| column_labels = [] |
| |
| left_scol_for = lambda label: scol_for( |
| left_table, left_internal.spark_column_name_for(label) |
| ) |
| right_scol_for = lambda label: scol_for( |
| right_table, right_internal.spark_column_name_for(label) |
| ) |
| |
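| # Build the output columns: join keys shared by both sides are emitted once (taken from
| # the right side or coalesced for right/full joins), while other duplicated labels get
| # the user-provided suffixes.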
| for label in left_internal.column_labels: |
| col = left_internal.spark_column_name_for(label) |
| scol = left_scol_for(label) |
| if label in duplicate_columns: |
| spark_column_name = left_internal.spark_column_name_for(label) |
| if ( |
| spark_column_name in left_key_names |
| and (right_prefix + spark_column_name) in right_key_names |
| ): |
| right_scol = right_scol_for(label) |
| if how == "right": |
| scol = right_scol.alias(col) |
| elif how == "full": |
| scol = F.when(scol.isNotNull(), scol).otherwise(right_scol).alias(col) |
| else: |
| pass |
| else: |
| col = col + left_suffix |
| scol = scol.alias(col) |
| label = tuple([str(label[0]) + left_suffix] + list(label[1:])) |
| exprs.append(scol) |
| data_columns.append(col) |
| column_labels.append(label) |
| for label in right_internal.column_labels: |
| # recover `right_prefix` here. |
| col = right_internal.spark_column_name_for(label)[len(right_prefix) :] |
| scol = right_scol_for(label).alias(col) |
| if label in duplicate_columns: |
| spark_column_name = left_internal.spark_column_name_for(label) |
| if ( |
| spark_column_name in left_key_names |
| and (right_prefix + spark_column_name) in right_key_names |
| ): |
| continue |
| else: |
| col = col + right_suffix |
| scol = scol.alias(col) |
| label = tuple([str(label[0]) + right_suffix] + list(label[1:])) |
| exprs.append(scol) |
| data_columns.append(col) |
| column_labels.append(label) |
| |
| left_index_scols = left_internal.index_spark_columns |
| right_index_scols = right_internal.index_spark_columns |
| |
| # Retain indices if they are used for joining |
| if left_index: |
| if right_index: |
| if how in ("inner", "left"): |
| exprs.extend(left_index_scols) |
| index_spark_column_names = left_internal.index_spark_column_names |
| index_names = left_internal.index_names |
| elif how == "right": |
| exprs.extend(right_index_scols) |
| index_spark_column_names = right_internal.index_spark_column_names |
| index_names = right_internal.index_names |
| else: |
| index_spark_column_names = left_internal.index_spark_column_names |
| index_names = left_internal.index_names |
| for col, left_scol, right_scol in zip( |
| index_spark_column_names, left_index_scols, right_index_scols |
| ): |
| scol = F.when(left_scol.isNotNull(), left_scol).otherwise(right_scol) |
| exprs.append(scol.alias(col)) |
| else: |
| exprs.extend(right_index_scols) |
| index_spark_column_names = right_internal.index_spark_column_names |
| index_names = right_internal.index_names |
| elif right_index: |
| exprs.extend(left_index_scols) |
| index_spark_column_names = left_internal.index_spark_column_names |
| index_names = left_internal.index_names |
| else: |
| index_spark_column_names = [] |
| index_names = [] |
| |
| selected_columns = joined_table.select(*exprs) |
| |
| internal = InternalFrame( |
| spark_frame=selected_columns, |
| index_spark_columns=[ |
| scol_for(selected_columns, col) for col in index_spark_column_names |
| ], |
| index_names=index_names, |
| column_labels=column_labels, |
| data_spark_columns=[scol_for(selected_columns, col) for col in data_columns], |
| ) |
| return DataFrame(internal) |
| |
| def join( |
| self, |
| right: "DataFrame", |
| on: Optional[Union[Name, List[Name]]] = None, |
| how: str = "left", |
| lsuffix: str = "", |
| rsuffix: str = "", |
| ) -> "DataFrame": |
| """ |
| Join columns of another DataFrame. |
| |
| Join columns with `right` DataFrame either on index or on a key column. Efficiently join |
| multiple DataFrame objects by index at once by passing a list. |
| |
| Parameters |
| ---------- |
| right: DataFrame, Series |
| on: str, list of str, or array-like, optional |
| Column or index level name(s) in the caller to join on the index in `right`, otherwise |
| joins index-on-index. If multiple values given, the `right` DataFrame must have a |
| MultiIndex. Can pass an array as the join key if it is not already contained in the |
| calling DataFrame. Like an Excel VLOOKUP operation. |
| how: {'left', 'right', 'outer', 'inner'}, default 'left' |
| How to handle the operation of the two objects. |
| |
| * left: use `left` frame’s index (or column if on is specified). |
| * right: use `right`’s index. |
| * outer: form union of `left` frame’s index (or column if on is specified) with |
| right’s index, and sort it lexicographically.
| * inner: form intersection of `left` frame’s index (or column if on is specified) |
| with `right`’s index, preserving the order of the `left`’s one. |
| lsuffix : str, default '' |
| Suffix to use from left frame's overlapping columns. |
| rsuffix : str, default '' |
| Suffix to use from `right` frame's overlapping columns. |
| |
| Returns |
| ------- |
| DataFrame |
| A dataframe containing columns from both the `left` and `right`. |
| |
| See Also |
| -------- |
| DataFrame.merge: For column(s)-on-columns(s) operations. |
| DataFrame.update : Modify in place using non-NA values from another DataFrame. |
| DataFrame.hint : Specifies some hint on the current DataFrame. |
| broadcast : Marks a DataFrame as small enough for use in broadcast joins. |
| |
| Notes |
| ----- |
| Parameters on, lsuffix, and rsuffix are not supported when passing a list of DataFrame |
| objects. |
| |
| Examples |
| -------- |
| >>> psdf1 = ps.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], |
| ... 'A': ['A0', 'A1', 'A2', 'A3']}, |
| ... columns=['key', 'A']) |
| >>> psdf2 = ps.DataFrame({'key': ['K0', 'K1', 'K2'], |
| ... 'B': ['B0', 'B1', 'B2']}, |
| ... columns=['key', 'B']) |
| >>> psdf1 |
| key A |
| 0 K0 A0 |
| 1 K1 A1 |
| 2 K2 A2 |
| 3 K3 A3 |
| >>> psdf2 |
| key B |
| 0 K0 B0 |
| 1 K1 B1 |
| 2 K2 B2 |
| |
| Join DataFrames using their indexes. |
| |
| >>> join_psdf = psdf1.join(psdf2, lsuffix='_left', rsuffix='_right') |
| >>> join_psdf.sort_values(by=join_psdf.columns) |
| key_left A key_right B |
| 0 K0 A0 K0 B0 |
| 1 K1 A1 K1 B1 |
| 2 K2 A2 K2 B2 |
| 3 K3 A3 None None |
| |
| If we want to join using the key columns, we need to set key to be the index in both df and |
| right. The joined DataFrame will have key as its index. |
| |
| >>> join_psdf = psdf1.set_index('key').join(psdf2.set_index('key')) |
| >>> join_psdf.sort_values(by=join_psdf.columns) # doctest: +NORMALIZE_WHITESPACE |
| A B |
| key |
| K0 A0 B0 |
| K1 A1 B1 |
| K2 A2 B2 |
| K3 A3 None |
| |
| Another option to join using the key columns is to use the `on` parameter. DataFrame.join
| always uses `right`’s index but we can use any column in df. Unlike pandas, this method
| does not preserve the original DataFrame’s index in the result.
| |
| >>> join_psdf = psdf1.join(psdf2.set_index('key'), on='key') |
| >>> join_psdf.index |
| Int64Index([0, 1, 2, 3], dtype='int64') |
| """ |
| if isinstance(right, ps.Series): |
| common = list(self.columns.intersection([right.name])) |
| else: |
| common = list(self.columns.intersection(right.columns)) |
| if len(common) > 0 and not lsuffix and not rsuffix: |
| raise ValueError( |
| "columns overlap but no suffix specified: " "{rename}".format(rename=common) |
| ) |
| |
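| # join() always joins on right's index. When "on" is given and differs from the current
| # index, temporarily set those columns as the index, merge index-on-index, and restore
| # them afterwards via reset_index().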
| need_set_index = False |
| if on: |
| if not is_list_like(on): |
| on = [on] # type: ignore |
| if len(on) != right._internal.index_level: |
| raise ValueError( |
| 'len(left_on) must equal the number of levels in the index of "right"' |
| ) |
| |
| need_set_index = len(set(on) & set(self.index.names)) == 0 |
| if need_set_index: |
| self = self.set_index(on) |
| join_psdf = self.merge( |
| right, left_index=True, right_index=True, how=how, suffixes=(lsuffix, rsuffix) |
| ) |
| return join_psdf.reset_index() if need_set_index else join_psdf |
| |
| def append( |
| self, |
| other: "DataFrame", |
| ignore_index: bool = False, |
| verify_integrity: bool = False, |
| sort: bool = False, |
| ) -> "DataFrame": |
| """ |
| Append rows of other to the end of caller, returning a new object. |
| |
| Columns in other that are not in the caller are added as new columns. |
| |
| Parameters |
| ---------- |
| other : DataFrame or Series/dict-like object, or list of these |
| The data to append. |
| |
| ignore_index : boolean, default False |
| If True, do not use the index labels. |
| |
| verify_integrity : boolean, default False |
| If True, raise ValueError on creating index with duplicates. |
| |
| sort : boolean, default False |
| Currently not supported. |
| |
| Returns |
| ------- |
| appended : DataFrame |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([[1, 2], [3, 4]], columns=list('AB')) |
| |
| >>> df.append(df) |
| A B |
| 0 1 2 |
| 1 3 4 |
| 0 1 2 |
| 1 3 4 |
| |
| >>> df.append(df, ignore_index=True) |
| A B |
| 0 1 2 |
| 1 3 4 |
| 2 1 2 |
| 3 3 4 |
| """ |
| if isinstance(other, ps.Series): |
| raise TypeError("DataFrames.append() does not support appending Series to DataFrames") |
| if sort: |
| raise NotImplementedError("The 'sort' parameter is currently not supported") |
| |
| if not ignore_index: |
| index_scols = self._internal.index_spark_columns |
| if len(index_scols) != other._internal.index_level: |
| raise ValueError("Both DataFrames have to have the same number of index levels") |
| |
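| # Checking for overlapping indices requires a distributed intersection of the index
| # columns, which triggers a Spark job here.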
| if verify_integrity and len(index_scols) > 0: |
| if ( |
| self._internal.spark_frame.select(index_scols) |
| .intersect( |
| other._internal.spark_frame.select(other._internal.index_spark_columns) |
| ) |
| .count() |
| ) > 0: |
| raise ValueError("Indices have overlapping values") |
| |
| # Lazy import to avoid circular dependency issues |
| from pyspark.pandas.namespace import concat |
| |
| return cast(DataFrame, concat([self, other], ignore_index=ignore_index)) |
| |
| # TODO: add 'filter_func' and 'errors' parameter |
| def update(self, other: "DataFrame", join: str = "left", overwrite: bool = True) -> None: |
| """ |
| Modify in place using non-NA values from another DataFrame. |
| Aligns on indices. There is no return value. |
| |
| Parameters |
| ---------- |
| other : DataFrame, or Series |
| join : 'left', default 'left' |
| Only left join is implemented, keeping the index and columns of the original object. |
| overwrite : bool, default True |
| How to handle non-NA values for overlapping keys: |
| |
| * True: overwrite original DataFrame's values with values from `other`. |
| * False: only update values that are NA in the original DataFrame. |
| |
| Returns |
| ------- |
| None : method directly changes calling object |
| |
| See Also |
| -------- |
| DataFrame.merge : For column(s)-on-columns(s) operations. |
| DataFrame.join : Join columns of another DataFrame. |
| DataFrame.hint : Specifies some hint on the current DataFrame. |
| broadcast : Marks a DataFrame as small enough for use in broadcast joins. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}, columns=['A', 'B']) |
| >>> new_df = ps.DataFrame({'B': [4, 5, 6], 'C': [7, 8, 9]}, columns=['B', 'C']) |
| >>> df.update(new_df) |
| >>> df.sort_index() |
| A B |
| 0 1 4 |
| 1 2 5 |
| 2 3 6 |
| |
| The DataFrame's length does not increase as a result of the update;
| only values at matching index/column labels are updated.
| |
| >>> df = ps.DataFrame({'A': ['a', 'b', 'c'], 'B': ['x', 'y', 'z']}, columns=['A', 'B']) |
| >>> new_df = ps.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}, columns=['B']) |
| >>> df.update(new_df) |
| >>> df.sort_index() |
| A B |
| 0 a d |
| 1 b e |
| 2 c f |
| |
| For Series, its name attribute must be set.
| |
| >>> df = ps.DataFrame({'A': ['a', 'b', 'c'], 'B': ['x', 'y', 'z']}, columns=['A', 'B']) |
| >>> new_column = ps.Series(['d', 'e'], name='B', index=[0, 2]) |
| >>> df.update(new_column) |
| >>> df.sort_index() |
| A B |
| 0 a d |
| 1 b y |
| 2 c e |
| |
| If `other` contains None, the corresponding values are not updated in the original
| DataFrame.
| |
| >>> df = ps.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]}, columns=['A', 'B']) |
| >>> new_df = ps.DataFrame({'B': [4, None, 6]}, columns=['B']) |
| >>> df.update(new_df) |
| >>> df.sort_index() |
| A B |
| 0 1 4.0 |
| 1 2 500.0 |
| 2 3 6.0 |
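|
| ``overwrite=False`` only updates values that are NA in the original object
| (illustrative example, skipped in doctests):
|
| >>> df = ps.DataFrame({'A': [1.0, None, 3.0]}, columns=['A'])
| >>> new_df = ps.DataFrame({'A': [10.0, 20.0, 30.0]}, columns=['A'])
| >>> df.update(new_df, overwrite=False)
| >>> df.sort_index()  # doctest: +SKIP
| A
| 0 1.0
| 1 20.0
| 2 3.0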
| """ |
| if join != "left": |
| raise NotImplementedError("Only left join is supported") |
| |
| if isinstance(other, ps.Series): |
| other = other.to_frame() |
| |
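| # Join "other" onto self with a "_new" suffix for the overlapping columns, then pick
| # between the old and new value per column according to "overwrite".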
| update_columns = list( |
| set(self._internal.column_labels).intersection(set(other._internal.column_labels)) |
| ) |
| update_sdf = self.join( |
| other[update_columns], rsuffix="_new" |
| )._internal.resolved_copy.spark_frame |
| |
| data_fields = self._internal.data_fields.copy() |
| for column_label in update_columns:
| column_name = self._internal.spark_column_name_for(column_label)
| old_col = scol_for(update_sdf, column_name)
| new_col = scol_for(
| update_sdf, other._internal.spark_column_name_for(column_label) + "_new"
| )
| if overwrite:
| update_sdf = update_sdf.withColumn(
| column_name, F.when(new_col.isNull(), old_col).otherwise(new_col)
| )
| else:
| update_sdf = update_sdf.withColumn(
| column_name, F.when(old_col.isNull(), new_col).otherwise(old_col)
| )
| data_fields[self._internal.column_labels.index(column_label)] = None
| sdf = update_sdf.select( |
| *[scol_for(update_sdf, col) for col in self._internal.spark_column_names], |
| *HIDDEN_COLUMNS, |
| ) |
| internal = self._internal.with_new_sdf(sdf, data_fields=data_fields) |
| self._update_internal_frame(internal, requires_same_anchor=False) |
| |
| def sample( |
| self, |
| n: Optional[int] = None, |
| frac: Optional[float] = None, |
| replace: bool = False, |
| random_state: Optional[int] = None, |
| ) -> "DataFrame": |
| """ |
| Return a random sample of items from an axis of object. |
| |
| Please call this function by specifying the ``frac`` argument as a named argument.
| |
| You can use `random_state` for reproducibility. However, note that, unlike pandas,
| specifying a seed in pandas-on-Spark/Spark does not guarantee the sampled rows will
| be fixed. The result set depends not only on the seed, but also on how the data is
| distributed across machines, and to some extent on network randomness when shuffle
| operations are involved.
| Even in the simplest case, the result set will depend on the system's CPU core count. |
| |
| Parameters |
| ---------- |
| n : int, optional |
| Number of items to return. This is currently NOT supported. Use frac instead. |
| frac : float, optional |
| Fraction of axis items to return. |
| replace : bool, default False |
| Sample with or without replacement. |
| random_state : int, optional |
| Seed for the random number generator (if int). |
| |
| Returns |
| ------- |
| Series or DataFrame |
| A new object of same type as caller containing the sampled items. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'num_legs': [2, 4, 8, 0], |
| ... 'num_wings': [2, 0, 0, 0], |
| ... 'num_specimen_seen': [10, 2, 1, 8]}, |
| ... index=['falcon', 'dog', 'spider', 'fish'], |
| ... columns=['num_legs', 'num_wings', 'num_specimen_seen']) |
| >>> df # doctest: +SKIP |
| num_legs num_wings num_specimen_seen |
| falcon 2 2 10 |
| dog 4 0 2 |
| spider 8 0 1 |
| fish 0 0 8 |
| |
| A random 25% sample of the ``DataFrame``. |
| Note that we use `random_state` to ensure the reproducibility of |
| the examples. |
| |
| >>> df.sample(frac=0.25, random_state=1) # doctest: +SKIP |
| num_legs num_wings num_specimen_seen |
| falcon 2 2 10 |
| fish 0 0 8 |
| |
| Extract 40% random elements from the ``Series`` ``df['num_legs']``, with replacement,
| so the same items could appear more than once.
| |
| >>> df['num_legs'].sample(frac=0.4, replace=True, random_state=1) # doctest: +SKIP |
| falcon 2 |
| spider 8 |
| spider 8 |
| Name: num_legs, dtype: int64 |
| |
| Specifying the exact number of items to return is not supported at the moment. |
| |
| >>> df.sample(n=5) # doctest: +ELLIPSIS |
| Traceback (most recent call last): |
| ... |
| NotImplementedError: Function sample currently does not support specifying ... |
| """ |
| # Note: we don't run any of the doctests because the result can change depending on the |
| # system's core count. |
| if n is not None: |
| raise NotImplementedError( |
| "Function sample currently does not support specifying " |
| "exact number of items to return. Use frac instead." |
| ) |
| |
| if frac is None: |
| raise ValueError("frac must be specified.") |
| |
| sdf = self._internal.resolved_copy.spark_frame.sample( |
| withReplacement=replace, fraction=frac, seed=random_state |
| ) |
| return DataFrame(self._internal.with_new_sdf(sdf)) |
| |
| def astype(self, dtype: Union[str, Dtype, Dict[Name, Union[str, Dtype]]]) -> "DataFrame": |
| """ |
| Cast a pandas-on-Spark object to a specified dtype ``dtype``. |
| |
| Parameters |
| ---------- |
| dtype : data type, or dict of column name -> data type |
| Use a numpy.dtype or Python type to cast entire pandas-on-Spark object to |
| the same type. Alternatively, use {col: dtype, ...}, where col is a |
| column label and dtype is a numpy.dtype or Python type to cast one |
| or more of the DataFrame's columns to column-specific types. |
| |
| Returns |
| ------- |
| casted : same type as caller |
| |
| See Also |
| -------- |
| to_datetime : Convert argument to datetime. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3]}, dtype='int64') |
| >>> df |
| a b |
| 0 1 1 |
| 1 2 2 |
| 2 3 3 |
| |
| Convert to float type: |
| |
| >>> df.astype('float') |
| a b |
| 0 1.0 1.0 |
| 1 2.0 2.0 |
| 2 3.0 3.0 |
| |
| Convert back to int64 type:
| |
| >>> df.astype('int64') |
| a b |
| 0 1 1 |
| 1 2 2 |
| 2 3 3 |
| |
| Convert column a to float type: |
| |
| >>> df.astype({'a': float}) |
| a b |
| 0 1.0 1 |
| 1 2.0 2 |
| 2 3.0 3 |
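|
| A dict can cast several columns at once (illustrative example, skipped in doctests):
|
| >>> df.astype({'a': float, 'b': 'int64'})  # doctest: +SKIP
| a b
| 0 1.0 1
| 1 2.0 2
| 2 3.0 3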
| |
| """ |
| applied = [] |
| if is_dict_like(dtype): |
| dtype_dict = cast(Dict[Name, Union[str, Dtype]], dtype) |
| for col_name in dtype_dict.keys(): |
| if col_name not in self.columns: |
| raise KeyError( |
| "Only a column name can be used for the " |
| "key in a dtype mappings argument." |
| ) |
| for col_name, col in self.items(): |
| if col_name in dtype_dict: |
| applied.append(col.astype(dtype=dtype_dict[col_name])) |
| else: |
| applied.append(col) |
| else: |
| for col_name, col in self.items(): |
| applied.append(col.astype(dtype=dtype)) |
| return DataFrame(self._internal.with_new_columns(applied)) |
| |
| def add_prefix(self, prefix: str) -> "DataFrame": |
| """ |
| Prefix labels with string `prefix`. |
| |
| For Series, the row labels are prefixed. |
| For DataFrame, the column labels are prefixed. |
| |
| Parameters |
| ---------- |
| prefix : str |
| The string to add before each label. |
| |
| Returns |
| ------- |
| DataFrame |
| New DataFrame with updated labels. |
| |
| See Also |
| -------- |
| Series.add_prefix: Prefix row labels with string `prefix`. |
| Series.add_suffix: Suffix row labels with string `suffix`. |
| DataFrame.add_suffix: Suffix column labels with string `suffix`. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, columns=['A', 'B']) |
| >>> df |
| A B |
| 0 1 3 |
| 1 2 4 |
| 2 3 5 |
| 3 4 6 |
| |
| >>> df.add_prefix('col_') |
| col_A col_B |
| 0 1 3 |
| 1 2 4 |
| 2 3 5 |
| 3 4 6 |
| """ |
| assert isinstance(prefix, str) |
| return self._apply_series_op( |
| lambda psser: psser.rename(tuple([prefix + i for i in psser._column_label])) |
| ) |
| |
| def add_suffix(self, suffix: str) -> "DataFrame": |
| """ |
| Suffix labels with string `suffix`. |
| |
| For Series, the row labels are suffixed. |
| For DataFrame, the column labels are suffixed. |
| |
| Parameters |
| ---------- |
| suffix : str |
| The string to add after each label.
| |
| Returns |
| ------- |
| DataFrame |
| New DataFrame with updated labels. |
| |
| See Also |
| -------- |
| Series.add_prefix: Prefix row labels with string `prefix`. |
| Series.add_suffix: Suffix row labels with string `suffix`. |
| DataFrame.add_prefix: Prefix column labels with string `prefix`. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}, columns=['A', 'B']) |
| >>> df |
| A B |
| 0 1 3 |
| 1 2 4 |
| 2 3 5 |
| 3 4 6 |
| |
| >>> df.add_suffix('_col') |
| A_col B_col |
| 0 1 3 |
| 1 2 4 |
| 2 3 5 |
| 3 4 6 |
| """ |
| assert isinstance(suffix, str) |
| return self._apply_series_op( |
| lambda psser: psser.rename(tuple([i + suffix for i in psser._column_label])) |
| ) |
| |
| # TODO: include, and exclude should be implemented. |
| def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame": |
| """ |
| Generate descriptive statistics that summarize the central tendency, |
| dispersion and shape of a dataset's distribution, excluding |
| ``NaN`` values. |
| |
| Analyzes both numeric and object series, as well |
| as ``DataFrame`` column sets of mixed data types. The output |
| will vary depending on what is provided. Refer to the notes |
| below for more detail. |
| |
| Parameters |
| ---------- |
| percentiles : list of ``float`` in range [0.0, 1.0], default [0.25, 0.5, 0.75] |
| A list of percentiles to be computed. |
| |
| Returns |
| ------- |
| DataFrame |
| Summary statistics of the Dataframe provided. |
| |
| See Also |
| -------- |
| DataFrame.count: Count number of non-NA/null observations. |
| DataFrame.max: Maximum of the values in the object. |
| DataFrame.min: Minimum of the values in the object. |
| DataFrame.mean: Mean of the values. |
| DataFrame.std: Standard deviation of the observations. |
| |
| Notes |
| ----- |
| For numeric data, the result's index will include ``count``, |
| ``mean``, ``std``, ``min``, ``25%``, ``50%``, ``75%``, ``max``. |
| |
| Currently only numeric data is supported. |
| |
| Examples |
| -------- |
| Describing a numeric ``Series``. |
| |
| >>> s = ps.Series([1, 2, 3]) |
| >>> s.describe() |
| count 3.0 |
| mean 2.0 |
| std 1.0 |
| min 1.0 |
| 25% 1.0 |
| 50% 2.0 |
| 75% 3.0 |
| max 3.0 |
| dtype: float64 |
| |
| Describing a ``DataFrame``. Only numeric fields are returned. |
| |
| >>> df = ps.DataFrame({'numeric1': [1, 2, 3], |
| ... 'numeric2': [4.0, 5.0, 6.0], |
| ... 'object': ['a', 'b', 'c'] |
| ... }, |
| ... columns=['numeric1', 'numeric2', 'object']) |
| >>> df.describe() |
| numeric1 numeric2 |
| count 3.0 3.0 |
| mean 2.0 5.0 |
| std 1.0 1.0 |
| min 1.0 4.0 |
| 25% 1.0 4.0 |
| 50% 2.0 5.0 |
| 75% 3.0 6.0 |
| max 3.0 6.0 |
| |
| For multi-index columns: |
| |
| >>> df.columns = [('num', 'a'), ('num', 'b'), ('obj', 'c')] |
| >>> df.describe() # doctest: +NORMALIZE_WHITESPACE |
| num |
| a b |
| count 3.0 3.0 |
| mean 2.0 5.0 |
| std 1.0 1.0 |
| min 1.0 4.0 |
| 25% 1.0 4.0 |
| 50% 2.0 5.0 |
| 75% 3.0 6.0 |
| max 3.0 6.0 |
| |
| >>> df[('num', 'b')].describe() |
| count 3.0 |
| mean 5.0 |
| std 1.0 |
| min 4.0 |
| 25% 4.0 |
| 50% 5.0 |
| 75% 6.0 |
| max 6.0 |
| Name: (num, b), dtype: float64 |
| |
| Describing a ``DataFrame`` and selecting custom percentiles. |
| |
| >>> df = ps.DataFrame({'numeric1': [1, 2, 3], |
| ... 'numeric2': [4.0, 5.0, 6.0] |
| ... }, |
| ... columns=['numeric1', 'numeric2']) |
| >>> df.describe(percentiles = [0.85, 0.15]) |
| numeric1 numeric2 |
| count 3.0 3.0 |
| mean 2.0 5.0 |
| std 1.0 1.0 |
| min 1.0 4.0 |
| 15% 1.0 4.0 |
| 50% 2.0 5.0 |
| 85% 3.0 6.0 |
| max 3.0 6.0 |
| |
| Describing a column from a ``DataFrame`` by accessing it as |
| an attribute. |
| |
| >>> df.numeric1.describe() |
| count 3.0 |
| mean 2.0 |
| std 1.0 |
| min 1.0 |
| 25% 1.0 |
| 50% 2.0 |
| 75% 3.0 |
| max 3.0 |
| Name: numeric1, dtype: float64 |
| |
| Describing a column from a ``DataFrame`` by accessing it as |
| an attribute and selecting custom percentiles. |
| |
| >>> df.numeric1.describe(percentiles = [0.85, 0.15]) |
| count 3.0 |
| mean 2.0 |
| std 1.0 |
| min 1.0 |
| 15% 1.0 |
| 50% 2.0 |
| 85% 3.0 |
| max 3.0 |
| Name: numeric1, dtype: float64 |
| """ |
| exprs = [] |
| column_labels = [] |
| for label in self._internal.column_labels: |
| scol = self._internal.spark_column_for(label) |
| spark_type = self._internal.spark_type_for(label) |
| # TODO(SPARK-36350): Make this work with DataTypeOps. |
| if isinstance(spark_type, (FloatType, DoubleType)): |
| exprs.append( |
| F.nanvl(scol, SF.lit(None)).alias(self._internal.spark_column_name_for(label)) |
| ) |
| column_labels.append(label) |
| elif isinstance(spark_type, NumericType): |
| exprs.append(scol) |
| column_labels.append(label) |
| |
| if len(exprs) == 0: |
| raise ValueError("Cannot describe a DataFrame without columns") |
| |
| if percentiles is not None: |
| if any((p < 0.0) or (p > 1.0) for p in percentiles): |
| raise ValueError("Percentiles should all be in the interval [0, 1]") |
| # appending 50% if not in percentiles already |
| percentiles = (percentiles + [0.5]) if 0.5 not in percentiles else percentiles |
| else: |
| percentiles = [0.25, 0.5, 0.75] |
| |
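| # Spark's summary() reports the standard deviation as "stddev"; it is renamed to
| # pandas' "std" below.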
| formatted_perc = ["{:.0%}".format(p) for p in sorted(percentiles)] |
| stats = ["count", "mean", "stddev", "min", *formatted_perc, "max"] |
| |
| sdf = self._internal.spark_frame.select(*exprs).summary(*stats) |
| sdf = sdf.replace("stddev", "std", subset=["summary"]) |
| |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, "summary")], |
| column_labels=column_labels, |
| data_spark_columns=[ |
| scol_for(sdf, self._internal.spark_column_name_for(label)) |
| for label in column_labels |
| ], |
| ) |
| return DataFrame(internal).astype("float64") |
| |
| def drop_duplicates( |
| self, |
| subset: Optional[Union[Name, List[Name]]] = None, |
| keep: str = "first", |
| inplace: bool = False, |
| ) -> Optional["DataFrame"]: |
| """ |
| Return DataFrame with duplicate rows removed, optionally only |
| considering certain columns. |
| |
| Parameters |
| ---------- |
| subset : column label or sequence of labels, optional |
| Only consider certain columns for identifying duplicates, by |
| default use all of the columns. |
| keep : {'first', 'last', False}, default 'first' |
| Determines which duplicates (if any) to keep. |
| - ``first`` : Drop duplicates except for the first occurrence. |
| - ``last`` : Drop duplicates except for the last occurrence. |
| - False : Drop all duplicates. |
| inplace : boolean, default False |
| Whether to drop duplicates in place or to return a copy. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame with duplicates removed or None if ``inplace=True``.
|
| Examples
| --------
| >>> df = ps.DataFrame(
| ... {'a': [1, 2, 2, 2, 3], 'b': ['a', 'a', 'a', 'c', 'd']}, columns = ['a', 'b']) |
| >>> df |
| a b |
| 0 1 a |
| 1 2 a |
| 2 2 a |
| 3 2 c |
| 4 3 d |
| |
| >>> df.drop_duplicates().sort_index() |
| a b |
| 0 1 a |
| 1 2 a |
| 3 2 c |
| 4 3 d |
| |
| >>> df.drop_duplicates('a').sort_index() |
| a b |
| 0 1 a |
| 1 2 a |
| 4 3 d |
| |
| >>> df.drop_duplicates(['a', 'b']).sort_index() |
| a b |
| 0 1 a |
| 1 2 a |
| 3 2 c |
| 4 3 d |
| |
| >>> df.drop_duplicates(keep='last').sort_index() |
| a b |
| 0 1 a |
| 2 2 a |
| 3 2 c |
| 4 3 d |
| |
| >>> df.drop_duplicates(keep=False).sort_index() |
| a b |
| 0 1 a |
| 3 2 c |
| 4 3 d |
| """ |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| |
| sdf, column = self._mark_duplicates(subset, keep) |
| |
| sdf = sdf.where(~scol_for(sdf, column)).drop(column) |
| internal = self._internal.with_new_sdf(sdf) |
| if inplace: |
| self._update_internal_frame(internal) |
| return None |
| else: |
| return DataFrame(internal) |
| |
| def reindex( |
| self, |
| labels: Optional[Sequence[Any]] = None, |
| index: Optional[Union["Index", Sequence[Any]]] = None, |
| columns: Optional[Union[pd.Index, Sequence[Any]]] = None, |
| axis: Optional[Axis] = None, |
| copy: Optional[bool] = True, |
| fill_value: Optional[Any] = None, |
| ) -> "DataFrame": |
| """ |
| Conform DataFrame to new index with optional filling logic, placing |
| NA/NaN in locations having no value in the previous index. A new object |
| is produced unless the new index is equivalent to the current one and |
| ``copy=False``. |
| |
| Parameters |
| ---------- |
| labels: array-like, optional |
| New labels / index to conform the axis specified by ‘axis’ to. |
| index, columns: array-like, optional |
| New labels / index to conform to, should be specified using keywords. |
| Preferably an Index object to avoid duplicating data.
| axis: int or str, optional |
| Axis to target. Can be either the axis name (‘index’, ‘columns’) or |
| number (0, 1). |
| copy : bool, default True |
| Return a new object, even if the passed indexes are the same. |
| fill_value : scalar, default np.NaN |
| Value to use for missing values. Defaults to NaN, but can be any |
| "compatible" value. |
| |
| Returns |
| ------- |
| DataFrame with changed index. |
| |
| See Also |
| -------- |
| DataFrame.set_index : Set row labels. |
| DataFrame.reset_index : Remove row labels or move them to new columns. |
| |
| Examples |
| -------- |
| |
| ``DataFrame.reindex`` supports two calling conventions |
| |
| * ``(index=index_labels, columns=column_labels, ...)`` |
| * ``(labels, axis={'index', 'columns'}, ...)`` |
| |
| We *highly* recommend using keyword arguments to clarify your |
| intent. |
| |
| Create a dataframe with some fictional data. |
| |
| >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] |
| >>> df = ps.DataFrame({ |
| ... 'http_status': [200, 200, 404, 404, 301], |
| ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, |
| ... index=index, |
| ... columns=['http_status', 'response_time']) |
| >>> df |
| http_status response_time |
| Firefox 200 0.04 |
| Chrome 200 0.02 |
| Safari 404 0.07 |
| IE10 404 0.08 |
| Konqueror 301 1.00 |
| |
| Create a new index and reindex the dataframe. By default |
| values in the new index that do not have corresponding |
| records in the dataframe are assigned ``NaN``. |
| |
| >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
| ... 'Chrome'] |
| >>> df.reindex(new_index).sort_index() |
| http_status response_time |
| Chrome 200.0 0.02 |
| Comodo Dragon NaN NaN |
| IE10 404.0 0.08 |
| Iceweasel NaN NaN |
| Safari 404.0 0.07 |
| |
| We can fill in the missing values by passing a value to |
| the keyword ``fill_value``. |
| |
| >>> df.reindex(new_index, fill_value=0, copy=False).sort_index() |
| http_status response_time |
| Chrome 200 0.02 |
| Comodo Dragon 0 0.00 |
| IE10 404 0.08 |
| Iceweasel 0 0.00 |
| Safari 404 0.07 |
| |
| We can also reindex the columns. |
| |
| >>> df.reindex(columns=['http_status', 'user_agent']).sort_index() |
| http_status user_agent |
| Chrome 200 NaN |
| Firefox 200 NaN |
| IE10 404 NaN |
| Konqueror 301 NaN |
| Safari 404 NaN |
| |
| Or we can use "axis-style" keyword arguments |
| |
| >>> df.reindex(['http_status', 'user_agent'], axis="columns").sort_index() |
| http_status user_agent |
| Chrome 200 NaN |
| Firefox 200 NaN |
| IE10 404 NaN |
| Konqueror 301 NaN |
| Safari 404 NaN |
| |
| To further illustrate the filling functionality in |
| ``reindex``, we will create a dataframe with a |
| monotonically increasing index (for example, a sequence |
| of dates). |
| |
| >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') |
| >>> df2 = ps.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, |
| ... index=date_index) |
| >>> df2.sort_index() |
| prices |
| 2010-01-01 100.0 |
| 2010-01-02 101.0 |
| 2010-01-03 NaN |
| 2010-01-04 100.0 |
| 2010-01-05 89.0 |
| 2010-01-06 88.0 |
| |
| Suppose we decide to expand the dataframe to cover a wider |
| date range. |
| |
| >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D') |
| >>> df2.reindex(date_index2).sort_index() |
| prices |
| 2009-12-29 NaN |
| 2009-12-30 NaN |
| 2009-12-31 NaN |
| 2010-01-01 100.0 |
| 2010-01-02 101.0 |
| 2010-01-03 NaN |
| 2010-01-04 100.0 |
| 2010-01-05 89.0 |
| 2010-01-06 88.0 |
| 2010-01-07 NaN |
| """ |
| if axis is not None and (index is not None or columns is not None): |
| raise TypeError("Cannot specify both 'axis' and any of 'index' or 'columns'.") |
| |
| if labels is not None: |
| axis = validate_axis(axis) |
| if axis == 0: |
| index = labels |
| elif axis == 1: |
| columns = labels |
| else: |
| raise ValueError( |
| "No axis named %s for object type %s." % (axis, type(axis).__name__) |
| ) |
| |
| if index is not None and not is_list_like(index): |
| raise TypeError( |
| "Index must be called with a collection of some kind, " |
| "%s was passed" % type(index) |
| ) |
| |
| if columns is not None and not is_list_like(columns): |
| raise TypeError( |
| "Columns must be called with a collection of some kind, " |
| "%s was passed" % type(columns) |
| ) |
| |
| df = self |
| |
| if index is not None: |
| df = df._reindex_index(index, fill_value) |
| |
| if columns is not None: |
| df = df._reindex_columns(columns, fill_value) |
| |
| # Copy |
| if copy and df is self: |
| return df.copy() |
| else: |
| return df |
| |
| def _reindex_index( |
| self, index: Optional[Union["Index", Sequence[Any]]], fill_value: Optional[Any] |
| ) -> "DataFrame": |
| # When axis is index, we can mimic pandas' by a right outer join. |
| nlevels = self._internal.index_level |
| assert nlevels <= 1 or ( |
| isinstance(index, ps.MultiIndex) and nlevels == index.nlevels |
| ), "MultiIndex DataFrame can only be reindexed with a similar pandas-on-Spark MultiIndex." |
| |
| index_columns = self._internal.index_spark_column_names |
| frame = self._internal.resolved_copy.spark_frame.drop(NATURAL_ORDER_COLUMN_NAME) |
| |
| if isinstance(index, ps.Index): |
| if nlevels != index.nlevels: |
| return DataFrame(index._internal.with_new_columns([])).reindex( |
| columns=self.columns, fill_value=fill_value |
| ) |
| |
| index_names = index._internal.index_names |
| scols = index._internal.index_spark_columns |
| labels = index._internal.spark_frame.select( |
| [scol.alias(index_column) for scol, index_column in zip(scols, index_columns)] |
| ) |
| else: |
| index = ps.Index(list(index)) |
| labels = index._internal.spark_frame.select(index.spark.column.alias(index_columns[0])) |
| index_names = self._internal.index_names |
| |
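| # When a fill_value is given, attach it as a temporary literal column on the labels
| # frame so that rows introduced by the right outer join can be filled in the same
| # select, without a second pass over the data.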
| if fill_value is not None: |
| frame_index_columns = [ |
| verify_temp_column_name(frame, "__frame_index_column_{}__".format(i)) |
| for i in range(nlevels) |
| ] |
| index_scols = [ |
| scol_for(frame, index_col).alias(frame_index_col) |
| for index_col, frame_index_col in zip(index_columns, frame_index_columns) |
| ] |
| scols = self._internal.resolved_copy.data_spark_columns |
| frame = frame.select(index_scols + scols) |
| |
| temp_fill_value = verify_temp_column_name(frame, "__fill_value__") |
| labels = labels.withColumn(temp_fill_value, SF.lit(fill_value)) |
| |
| frame_index_scols = [scol_for(frame, col) for col in frame_index_columns] |
| labels_index_scols = [scol_for(labels, col) for col in index_columns] |
| |
| joined_df = frame.join( |
| labels, |
| on=[fcol == lcol for fcol, lcol in zip(frame_index_scols, labels_index_scols)], |
| how="right", |
| ) |
| |
| joined_df = joined_df.select( |
| *labels_index_scols, |
| *[ |
| F.when( |
| reduce( |
| lambda c1, c2: c1 & c2, |
| [ |
| fcol.isNull() & lcol.isNotNull() |
| for fcol, lcol in zip(frame_index_scols, labels_index_scols) |
| ], |
| ), |
| scol_for(joined_df, temp_fill_value), |
| ) |
| .otherwise(scol_for(joined_df, col)) |
| .alias(col) |
| for col in self._internal.data_spark_column_names |
| ], |
| ) |
| data_fields = None |
| else: |
| joined_df = frame.join(labels, on=index_columns, how="right") |
| data_fields = [field.copy(nullable=True) for field in self._internal.data_fields] |
| |
| sdf = joined_df.drop(NATURAL_ORDER_COLUMN_NAME) |
| internal = self._internal.copy( |
| spark_frame=sdf, |
| index_spark_columns=[ |
| scol_for(sdf, col) for col in self._internal.index_spark_column_names |
| ], |
| index_names=index_names, |
| index_fields=[ |
| field.copy(name=name) |
| for field, name in zip( |
| index._internal.index_fields, self._internal.index_spark_column_names |
| ) |
| ], |
| data_spark_columns=[ |
| scol_for(sdf, col) for col in self._internal.data_spark_column_names |
| ], |
| data_fields=data_fields, |
| ) |
| return DataFrame(internal) |
| |
| def _reindex_columns( |
| self, columns: Optional[Union[pd.Index, Sequence[Any]]], fill_value: Optional[Any] |
| ) -> "DataFrame": |
| level = self._internal.column_labels_level |
| if level > 1: |
| label_columns = list(columns) |
| for col in label_columns: |
| if not isinstance(col, tuple): |
| raise TypeError("Expected tuple, got {}".format(type(col).__name__)) |
| else: |
| label_columns = [(col,) for col in columns] |
| for col in label_columns: |
| if len(col) != level: |
| raise ValueError( |
| "shape (1,{}) doesn't match the shape (1,{})".format(len(col), level) |
| ) |
| fill_value = np.nan if fill_value is None else fill_value |
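| # For each requested label, reuse the existing Series when present; otherwise create a
| # literal column filled with fill_value.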
| scols_or_pssers = [] # type: List[Union[Series, Column]] |
| labels = [] |
| for label in label_columns: |
| if label in self._internal.column_labels: |
| scols_or_pssers.append(self._psser_for(label)) |
| else: |
| scols_or_pssers.append(SF.lit(fill_value).alias(name_like_string(label))) |
| labels.append(label) |
| |
| if isinstance(columns, pd.Index): |
| column_label_names = [ |
| name if is_name_like_tuple(name) else (name,) for name in columns.names |
| ] |
| internal = self._internal.with_new_columns( |
| scols_or_pssers, column_labels=labels, column_label_names=column_label_names |
| ) |
| else: |
| internal = self._internal.with_new_columns(scols_or_pssers, column_labels=labels) |
| |
| return DataFrame(internal) |
| |
| def reindex_like(self, other: "DataFrame", copy: bool = True) -> "DataFrame": |
| """ |
| Return a DataFrame with matching indices as other object. |
| |
| Conform the object to the same index on all axes. Places NA/NaN in locations |
| having no value in the previous index. A new object is produced unless the |
| new index is equivalent to the current one and copy=False. |
| |
| Parameters |
| ---------- |
| other : DataFrame |
| Its row and column indices are used to define the new indices |
| of this object. |
| copy : bool, default True |
| Return a new object, even if the passed indexes are the same. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame with changed indices on each axis. |
| |
| See Also |
| -------- |
| DataFrame.set_index : Set row labels. |
| DataFrame.reset_index : Remove row labels or move them to new columns. |
| DataFrame.reindex : Change to new indices or expand indices. |
| |
| Notes |
| ----- |
| Same as calling |
| ``.reindex(index=other.index, columns=other.columns,...)``. |
| |
| Examples |
| -------- |
| |
| >>> df1 = ps.DataFrame([[24.3, 75.7, 'high'], |
| ... [31, 87.8, 'high'], |
| ... [22, 71.6, 'medium'], |
| ... [35, 95, 'medium']], |
| ... columns=['temp_celsius', 'temp_fahrenheit', |
| ... 'windspeed'], |
| ... index=pd.date_range(start='2014-02-12', |
| ... end='2014-02-15', freq='D')) |
| >>> df1 |
| temp_celsius temp_fahrenheit windspeed |
| 2014-02-12 24.3 75.7 high |
| 2014-02-13 31.0 87.8 high |
| 2014-02-14 22.0 71.6 medium |
| 2014-02-15 35.0 95.0 medium |
| |
| >>> df2 = ps.DataFrame([[28, 'low'], |
| ... [30, 'low'], |
| ... [35.1, 'medium']], |
| ... columns=['temp_celsius', 'windspeed'], |
| ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', |
| ... '2014-02-15'])) |
| >>> df2 |
| temp_celsius windspeed |
| 2014-02-12 28.0 low |
| 2014-02-13 30.0 low |
| 2014-02-15 35.1 medium |
| |
| >>> df2.reindex_like(df1).sort_index() # doctest: +NORMALIZE_WHITESPACE |
| temp_celsius temp_fahrenheit windspeed |
| 2014-02-12 28.0 NaN low |
| 2014-02-13 30.0 NaN low |
| 2014-02-14 NaN NaN None |
| 2014-02-15 35.1 NaN medium |
| """ |
| |
| if isinstance(other, DataFrame): |
| return self.reindex(index=other.index, columns=other.columns, copy=copy) |
| else: |
| raise TypeError("other must be a pandas-on-Spark DataFrame") |
| |
| def melt( |
| self, |
| id_vars: Optional[Union[Name, List[Name]]] = None, |
| value_vars: Optional[Union[Name, List[Name]]] = None, |
| var_name: Optional[Union[str, List[str]]] = None, |
| value_name: str = "value", |
| ) -> "DataFrame": |
| """ |
| Unpivot a DataFrame from wide format to long format, optionally |
| leaving identifier variables set. |
| |
| This function is useful to massage a DataFrame into a format where one |
| or more columns are identifier variables (`id_vars`), while all other |
| columns, considered measured variables (`value_vars`), are "unpivoted" to |
| the row axis, leaving just two non-identifier columns, 'variable' and |
| 'value'. |
| |
| Parameters |
| ---------- |
| id_vars : tuple, list, or ndarray, optional |
| Column(s) to use as identifier variables. |
| value_vars : tuple, list, or ndarray, optional |
| Column(s) to unpivot. If not specified, uses all columns that |
| are not set as `id_vars`. |
| var_name : scalar, default 'variable' |
| Name to use for the 'variable' column. If None, it uses `frame.columns.name` or
| ‘variable’.
| value_name : scalar, default 'value' |
| Name to use for the 'value' column. |
| |
| Returns |
| ------- |
| DataFrame |
| Unpivoted DataFrame. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, |
| ... 'B': {0: 1, 1: 3, 2: 5}, |
| ... 'C': {0: 2, 1: 4, 2: 6}}, |
| ... columns=['A', 'B', 'C']) |
| >>> df |
| A B C |
| 0 a 1 2 |
| 1 b 3 4 |
| 2 c 5 6 |
| |
| >>> ps.melt(df) |
| variable value |
| 0 A a |
| 1 B 1 |
| 2 C 2 |
| 3 A b |
| 4 B 3 |
| 5 C 4 |
| 6 A c |
| 7 B 5 |
| 8 C 6 |
| |
| >>> df.melt(id_vars='A') |
| A variable value |
| 0 a B 1 |
| 1 a C 2 |
| 2 b B 3 |
| 3 b C 4 |
| 4 c B 5 |
| 5 c C 6 |
| |
| >>> df.melt(value_vars='A') |
| variable value |
| 0 A a |
| 1 A b |
| 2 A c |
| |
| >>> ps.melt(df, id_vars=['A', 'B']) |
| A B variable value |
| 0 a 1 C 2 |
| 1 b 3 C 4 |
| 2 c 5 C 6 |
| |
| >>> df.melt(id_vars=['A'], value_vars=['C']) |
| A variable value |
| 0 a C 2 |
| 1 b C 4 |
| 2 c C 6 |
| |
| The names of 'variable' and 'value' columns can be customized: |
| |
| >>> ps.melt(df, id_vars=['A'], value_vars=['B'], |
| ... var_name='myVarname', value_name='myValname') |
| A myVarname myValname |
| 0 a B 1 |
| 1 b B 3 |
| 2 c B 5 |
| """ |
| column_labels = self._internal.column_labels |
| |
| if id_vars is None: |
| id_vars = [] |
| else: |
| if isinstance(id_vars, tuple): |
| if self._internal.column_labels_level == 1: |
| id_vars = [idv if is_name_like_tuple(idv) else (idv,) for idv in id_vars] |
| else: |
| raise ValueError(
| "id_vars must be a list of tuples when columns are a MultiIndex"
| )
| elif is_name_like_value(id_vars): |
| id_vars = [(id_vars,)] |
| else: |
| id_vars = [idv if is_name_like_tuple(idv) else (idv,) for idv in id_vars] |
| |
| non_existence_col = [idv for idv in id_vars if idv not in column_labels] |
| if len(non_existence_col) != 0: |
| raveled_column_labels = np.ravel(column_labels) |
| missing = [ |
| nec for nec in np.ravel(non_existence_col) if nec not in raveled_column_labels |
| ] |
| if len(missing) != 0: |
| raise KeyError( |
| "The following 'id_vars' are not present" |
| " in the DataFrame: {}".format(missing) |
| ) |
| else: |
| raise KeyError( |
| "None of {} are in the {}".format(non_existence_col, column_labels) |
| ) |
| |
| if value_vars is None: |
| value_vars = [] |
| else: |
| if isinstance(value_vars, tuple): |
| if self._internal.column_labels_level == 1: |
| value_vars = [ |
| valv if is_name_like_tuple(valv) else (valv,) for valv in value_vars |
| ] |
| else: |
| raise ValueError(
| "value_vars must be a list of tuples when columns are a MultiIndex"
| )
| elif is_name_like_value(value_vars): |
| value_vars = [(value_vars,)] |
| else: |
| value_vars = [valv if is_name_like_tuple(valv) else (valv,) for valv in value_vars] |
| |
| non_existence_col = [valv for valv in value_vars if valv not in column_labels] |
| if len(non_existence_col) != 0: |
| raveled_column_labels = np.ravel(column_labels) |
| missing = [ |
| nec for nec in np.ravel(non_existence_col) if nec not in raveled_column_labels |
| ] |
| if len(missing) != 0: |
| raise KeyError( |
| "The following 'value_vars' are not present" |
| " in the DataFrame: {}".format(missing) |
| ) |
| else: |
| raise KeyError( |
| "None of {} are in the {}".format(non_existence_col, column_labels) |
| ) |
| |
| if len(value_vars) == 0: |
| value_vars = column_labels |
| |
| column_labels = [label for label in column_labels if label not in id_vars] |
| |
| sdf = self._internal.spark_frame |
| |
| if var_name is None: |
| if ( |
| self._internal.column_labels_level == 1 |
| and self._internal.column_label_names[0] is None |
| ): |
| var_name = ["variable"] |
| else: |
| var_name = [ |
| name_like_string(name) if name is not None else "variable_{}".format(i) |
| for i, name in enumerate(self._internal.column_label_names) |
| ] |
| elif isinstance(var_name, str): |
| var_name = [var_name] |
| |
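| # Unpivot by building one struct per measured column, holding its label parts as the
| # "variable" column(s) and its values as the "value" column, then exploding the array
| # of structs into rows.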
| pairs = F.explode( |
| F.array( |
| *[ |
| F.struct( |
| *[SF.lit(c).alias(name) for c, name in zip(label, var_name)], |
| *[self._internal.spark_column_for(label).alias(value_name)], |
| ) |
| for label in column_labels |
| if label in value_vars |
| ] |
| ) |
| ) |
| |
| columns = ( |
| [ |
| self._internal.spark_column_for(label).alias(name_like_string(label)) |
| for label in id_vars |
| ] |
| + [F.col("pairs.`%s`" % name) for name in var_name] |
| + [F.col("pairs.`%s`" % value_name)] |
| ) |
| exploded_df = sdf.withColumn("pairs", pairs).select(columns) |
| |
| return DataFrame( |
| InternalFrame( |
| spark_frame=exploded_df, |
| index_spark_columns=None, |
| column_labels=( |
| [label if len(label) == 1 else (name_like_string(label),) for label in id_vars] |
| + [(name,) for name in var_name] |
| + [(value_name,)] |
| ), |
| ) |
| ) |
| |
| def stack(self) -> DataFrameOrSeries: |
| """ |
| Stack the prescribed level(s) from columns to index. |
| |
| Return a reshaped DataFrame or Series having a multi-level |
| index with one or more new inner-most levels compared to the current |
| DataFrame. The new inner-most levels are created by pivoting the |
| columns of the current dataframe: |
| |
| - if the columns have a single level, the output is a Series; |
| - if the columns have multiple levels, the new index |
| level(s) is (are) taken from the prescribed level(s) and |
| the output is a DataFrame. |
| |
| The new index levels are sorted. |
| |
| Returns |
| ------- |
| DataFrame or Series |
| Stacked dataframe or series. |
| |
| See Also |
| -------- |
| DataFrame.unstack : Unstack prescribed level(s) from index axis |
| onto column axis. |
| DataFrame.pivot : Reshape dataframe from long format to wide |
| format. |
| DataFrame.pivot_table : Create a spreadsheet-style pivot table |
| as a DataFrame. |
| |
| Notes |
| ----- |
| The function is named by analogy with a collection of books |
| being reorganized from being side by side on a horizontal |
| position (the columns of the dataframe) to being stacked |
| vertically on top of each other (in the index of the |
| dataframe). |
| |
| Examples |
| -------- |
| **Single level columns** |
| |
| >>> df_single_level_cols = ps.DataFrame([[0, 1], [2, 3]], |
| ... index=['cat', 'dog'], |
| ... columns=['weight', 'height']) |
| |
| Stacking a dataframe with a single level column axis returns a Series: |
| |
| >>> df_single_level_cols |
| weight height |
| cat 0 1 |
| dog 2 3 |
| >>> df_single_level_cols.stack().sort_index() |
| cat height 1 |
| weight 0 |
| dog height 3 |
| weight 2 |
| dtype: int64 |
| |
| **Multi level columns: simple case** |
| |
| >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), |
| ... ('weight', 'pounds')]) |
| >>> df_multi_level_cols1 = ps.DataFrame([[1, 2], [2, 4]], |
| ... index=['cat', 'dog'], |
| ... columns=multicol1) |
| |
| Stacking a dataframe with a multi-level column axis: |
| |
| >>> df_multi_level_cols1 # doctest: +NORMALIZE_WHITESPACE |
| weight |
| kg pounds |
| cat 1 2 |
| dog 2 4 |
| >>> df_multi_level_cols1.stack().sort_index() |
| weight |
| cat kg 1 |
| pounds 2 |
| dog kg 2 |
| pounds 4 |
| |
| **Missing values** |
| |
| >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), |
| ... ('height', 'm')]) |
| >>> df_multi_level_cols2 = ps.DataFrame([[1.0, 2.0], [3.0, 4.0]], |
| ... index=['cat', 'dog'], |
| ... columns=multicol2) |
| |
| It is common to have missing values when stacking a dataframe |
| with multi-level columns, as the stacked dataframe typically |
| has more values than the original dataframe. Missing values |
| are filled with NaNs: |
| |
| >>> df_multi_level_cols2 |
| weight height |
| kg m |
| cat 1.0 2.0 |
| dog 3.0 4.0 |
| >>> df_multi_level_cols2.stack().sort_index() # doctest: +SKIP |
| height weight |
| cat kg NaN 1.0 |
| m 2.0 NaN |
| dog kg NaN 3.0 |
| m 4.0 NaN |
| """ |
| from pyspark.pandas.series import first_series |
| |
| if len(self._internal.column_labels) == 0: |
| return DataFrame( |
| self._internal.copy( |
| column_label_names=self._internal.column_label_names[:-1] |
| ).with_filter(SF.lit(False)) |
| ) |
| |
| column_labels = defaultdict(dict) # type: Union[defaultdict, OrderedDict] |
| index_values = set() |
| should_return_series = False
| for label in self._internal.column_labels: |
| new_label = label[:-1] |
| if len(new_label) == 0: |
| new_label = None |
| should_return_series = True
| value = label[-1] |
| |
| scol = self._internal.spark_column_for(label) |
| column_labels[new_label][value] = scol |
| |
| index_values.add(value) |
| |
| column_labels = OrderedDict(sorted(column_labels.items(), key=lambda x: x[0])) |
| |
| index_name = self._internal.column_label_names[-1] |
| column_label_names = self._internal.column_label_names[:-1] |
| if len(column_label_names) == 0: |
| column_label_names = [None] |
| |
| index_column = SPARK_INDEX_NAME_FORMAT(self._internal.index_level) |
| data_columns = [name_like_string(label) for label in column_labels] |
| |
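| # For each distinct inner-most column value, build a struct that carries that value as
| # the new index level together with the matching column (or null) for every remaining
| # label; exploding the array of structs moves the values into rows.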
| structs = [ |
| F.struct( |
| *[SF.lit(value).alias(index_column)], |
| *[ |
| ( |
| column_labels[label][value] |
| if value in column_labels[label] |
| else SF.lit(None) |
| ).alias(name) |
| for label, name in zip(column_labels, data_columns) |
| ], |
| ).alias(value) |
| for value in index_values |
| ] |
| |
| pairs = F.explode(F.array(*structs)) |
| |
| sdf = self._internal.spark_frame.withColumn("pairs", pairs) |
| sdf = sdf.select( |
| self._internal.index_spark_columns |
| + [sdf["pairs"][index_column].alias(index_column)] |
| + [sdf["pairs"][name].alias(name) for name in data_columns] |
| ) |
| |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[ |
| scol_for(sdf, col) |
| for col in (self._internal.index_spark_column_names + [index_column]) |
| ], |
| index_names=self._internal.index_names + [index_name], |
| index_fields=self._internal.index_fields + [None], |
| column_labels=list(column_labels), |
| data_spark_columns=[scol_for(sdf, col) for col in data_columns], |
| column_label_names=column_label_names, |
| ) |
| psdf = DataFrame(internal) # type: "DataFrame" |
| |
| if should_return_series:
| return first_series(psdf) |
| else: |
| return psdf |
| |
| def unstack(self) -> DataFrameOrSeries: |
| """ |
| Pivot the (necessarily hierarchical) index labels. |
| |
| Returns a DataFrame having a new level of column labels whose inner-most level |
| consists of the pivoted index labels. |
| |
| If the index is not a MultiIndex, the output will be a Series. |
| |
| .. note:: If the index is a MultiIndex, the output DataFrame could be very wide, and |
| it could cause a serious performance degradation since Spark partitions it row-wise.
| |
| Returns |
| ------- |
| Series or DataFrame |
| |
| See Also |
| -------- |
| DataFrame.pivot : Pivot a table based on column values. |
| DataFrame.stack : Pivot a level of the column labels (inverse operation from unstack). |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({"A": {"0": "a", "1": "b", "2": "c"}, |
| ... "B": {"0": "1", "1": "3", "2": "5"}, |
| ... "C": {"0": "2", "1": "4", "2": "6"}}, |
| ... columns=["A", "B", "C"]) |
| >>> df |
| A B C |
| 0 a 1 2 |
| 1 b 3 4 |
| 2 c 5 6 |
| |
| >>> df.unstack().sort_index() |
| A 0 a |
| 1 b |
| 2 c |
| B 0 1 |
| 1 3 |
| 2 5 |
| C 0 2 |
| 1 4 |
| 2 6 |
| dtype: object |
| |
| >>> df.columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')]) |
| >>> df.unstack().sort_index() |
| X A 0 a |
| 1 b |
| 2 c |
| B 0 1 |
| 1 3 |
| 2 5 |
| Y C 0 2 |
| 1 4 |
| 2 6 |
| dtype: object |
| |
| For MultiIndex case: |
| |
| >>> df = ps.DataFrame({"A": ["a", "b", "c"], |
| ... "B": [1, 3, 5], |
| ... "C": [2, 4, 6]}, |
| ... columns=["A", "B", "C"]) |
| >>> df = df.set_index('A', append=True) |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| B C |
| A |
| 0 a 1 2 |
| 1 b 3 4 |
| 2 c 5 6 |
| >>> df.unstack().sort_index() # doctest: +NORMALIZE_WHITESPACE |
| B C |
| A a b c a b c |
| 0 1.0 NaN NaN 2.0 NaN NaN |
| 1 NaN 3.0 NaN NaN 4.0 NaN |
| 2 NaN NaN 5.0 NaN NaN 6.0 |
| """ |
| from pyspark.pandas.series import first_series |
| |
| if self._internal.index_level > 1: |
| # The index after `reset_index()` will never be used, so use "distributed" index |
| # as a dummy to avoid overhead. |
| with option_context("compute.default_index_type", "distributed"): |
| df = self.reset_index() |
| index = df._internal.column_labels[: self._internal.index_level - 1] |
| columns = df.columns[self._internal.index_level - 1] |
| df = df.pivot_table( |
| index=index, columns=columns, values=self._internal.column_labels, aggfunc="first" |
| ) |
| internal = df._internal.copy( |
| index_names=self._internal.index_names[:-1], |
| index_fields=df._internal.index_fields[: self._internal.index_level - 1], |
| column_label_names=( |
| df._internal.column_label_names[:-1] |
| + [ |
| None |
| if self._internal.index_names[-1] is None |
| else df._internal.column_label_names[-1] |
| ] |
| ), |
| ) |
| return DataFrame(internal) |
| |
# TODO: The code here is similar to melt. Should we deduplicate?
| column_labels = self._internal.column_labels |
| ser_name = SPARK_DEFAULT_SERIES_NAME |
| sdf = self._internal.spark_frame |
| new_index_columns = [ |
| SPARK_INDEX_NAME_FORMAT(i) for i in range(self._internal.column_labels_level) |
| ] |
| |
| new_index_map = list(zip_longest(new_index_columns, self._internal.column_label_names, [])) |
| |
| pairs = F.explode( |
| F.array( |
| *[ |
| F.struct( |
| *[SF.lit(c).alias(name) for c, name in zip(idx, new_index_columns)], |
| *[self._internal.spark_column_for(idx).alias(ser_name)], |
| ) |
| for idx in column_labels |
| ] |
| ) |
| ) |
| |
| columns = [ |
| F.col("pairs.%s" % name) |
| for name in new_index_columns[: self._internal.column_labels_level] |
| ] + [F.col("pairs.%s" % ser_name)] |
| |
| new_index_len = len(new_index_columns) |
| existing_index_columns = [] |
| for i, (index_name, index_field) in enumerate( |
| zip(self._internal.index_names, self._internal.index_fields) |
| ): |
| name = SPARK_INDEX_NAME_FORMAT(i + new_index_len) |
| new_index_map.append((name, index_name, index_field.copy(name=name))) |
| existing_index_columns.append(self._internal.index_spark_columns[i].alias(name)) |
| |
| exploded_df = sdf.withColumn("pairs", pairs).select(existing_index_columns + columns) |
| |
| index_spark_column_names, index_names, index_fields = zip(*new_index_map) |
| return first_series( |
| DataFrame( |
| InternalFrame( |
| exploded_df, |
| index_spark_columns=[ |
| scol_for(exploded_df, col) for col in index_spark_column_names |
| ], |
| index_names=list(index_names), |
| index_fields=list(index_fields), |
| column_labels=[None], |
| ) |
| ) |
| ) |
| |
| # TODO: axis, skipna, and many arguments should be implemented. |
| def all(self, axis: Axis = 0) -> "Series": |
| """ |
| Return whether all elements are True. |
| |
Returns True unless there is at least one element within a series that is
False or equivalent (e.g. zero or empty).
| |
| Parameters |
| ---------- |
| axis : {0 or 'index'}, default 0 |
| Indicate which axis or axes should be reduced. |
| |
| * 0 / 'index' : reduce the index, return a Series whose index is the |
| original column labels. |
| |
| Returns |
| ------- |
| Series |
| |
| Examples |
| -------- |
| Create a dataframe from a dictionary. |
| |
| >>> df = ps.DataFrame({ |
| ... 'col1': [True, True, True], |
| ... 'col2': [True, False, False], |
| ... 'col3': [0, 0, 0], |
| ... 'col4': [1, 2, 3], |
| ... 'col5': [True, True, None], |
| ... 'col6': [True, False, None]}, |
| ... columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6']) |
| |
Default behaviour checks whether the values in each column all evaluate to True.
| |
| >>> df.all() |
| col1 True |
| col2 False |
| col3 False |
| col4 True |
| col5 True |
| col6 False |
| dtype: bool |
| """ |
| from pyspark.pandas.series import first_series |
| |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError('axis should be either 0 or "index" currently.') |
| |
| applied = [] |
| column_labels = self._internal.column_labels |
| for label in column_labels: |
| scol = self._internal.spark_column_for(label) |
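# `min` over boolean values acts as a logical AND; nulls are coalesced to
# True beforehand so that missing values are skipped, as pandas does.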
| all_col = F.min(F.coalesce(scol.cast("boolean"), SF.lit(True))) |
| applied.append(F.when(all_col.isNull(), True).otherwise(all_col)) |
| |
# TODO: there is similar transpose logic in, for instance,
| # DataFrame.any, Series.quantile. Maybe we should deduplicate it. |
| value_column = "value" |
| cols = [] |
| for label, applied_col in zip(column_labels, applied): |
| cols.append( |
| F.struct( |
| *[SF.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in enumerate(label)], |
| *[applied_col.alias(value_column)], |
| ) |
| ) |
| |
| sdf = self._internal.spark_frame.select(F.array(*cols).alias("arrays")).select( |
| F.explode(F.col("arrays")) |
| ) |
| sdf = sdf.selectExpr("col.*") |
| |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[ |
| scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) |
| for i in range(self._internal.column_labels_level) |
| ], |
| index_names=self._internal.column_label_names, |
| column_labels=[None], |
| data_spark_columns=[scol_for(sdf, value_column)], |
| ) |
| |
| return first_series(DataFrame(internal)) |
| |
| # TODO: axis, skipna, and many arguments should be implemented. |
| def any(self, axis: Axis = 0) -> "Series": |
| """ |
| Return whether any element is True. |
| |
| Returns False unless there is at least one element within a series that is |
| True or equivalent (e.g. non-zero or non-empty). |
| |
| Parameters |
| ---------- |
| axis : {0 or 'index'}, default 0 |
| Indicate which axis or axes should be reduced. |
| |
| * 0 / 'index' : reduce the index, return a Series whose index is the |
| original column labels. |
| |
| Returns |
| ------- |
| Series |
| |
| Examples |
| -------- |
| Create a dataframe from a dictionary. |
| |
| >>> df = ps.DataFrame({ |
| ... 'col1': [False, False, False], |
| ... 'col2': [True, False, False], |
| ... 'col3': [0, 0, 1], |
| ... 'col4': [0, 1, 2], |
| ... 'col5': [False, False, None], |
| ... 'col6': [True, False, None]}, |
| ... columns=['col1', 'col2', 'col3', 'col4', 'col5', 'col6']) |
| |
Default behaviour checks whether any value in each column evaluates to True.
| |
| >>> df.any() |
| col1 False |
| col2 True |
| col3 True |
| col4 True |
| col5 False |
| col6 True |
| dtype: bool |
| """ |
| from pyspark.pandas.series import first_series |
| |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError('axis should be either 0 or "index" currently.') |
| |
| applied = [] |
| column_labels = self._internal.column_labels |
| for label in column_labels: |
| scol = self._internal.spark_column_for(label) |
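# `max` over boolean values acts as a logical OR; nulls are coalesced to
# False beforehand so that missing values are skipped, as pandas does.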
| all_col = F.max(F.coalesce(scol.cast("boolean"), SF.lit(False))) |
| applied.append(F.when(all_col.isNull(), False).otherwise(all_col)) |
| |
# TODO: there is similar transpose logic in, for instance,
| # DataFrame.all, Series.quantile. Maybe we should deduplicate it. |
| value_column = "value" |
| cols = [] |
| for label, applied_col in zip(column_labels, applied): |
| cols.append( |
| F.struct( |
| *[SF.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in enumerate(label)], |
| *[applied_col.alias(value_column)], |
| ) |
| ) |
| |
| sdf = self._internal.spark_frame.select(F.array(*cols).alias("arrays")).select( |
| F.explode(F.col("arrays")) |
| ) |
| sdf = sdf.selectExpr("col.*") |
| |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[ |
| scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) |
| for i in range(self._internal.column_labels_level) |
| ], |
| index_names=self._internal.column_label_names, |
| column_labels=[None], |
| data_spark_columns=[scol_for(sdf, value_column)], |
| ) |
| |
| return first_series(DataFrame(internal)) |
| |
| # TODO: add axis, numeric_only, pct, na_option parameter |
| def rank(self, method: str = "average", ascending: bool = True) -> "DataFrame": |
| """ |
| Compute numerical data ranks (1 through n) along axis. Equal values are |
| assigned a rank that is the average of the ranks of those values. |
| |
.. note:: the current implementation of rank uses Spark's Window without
specifying partition specification. This moves all data into a single
partition on a single machine and could cause serious performance
degradation. Avoid this method against very large datasets.
| |
| Parameters |
| ---------- |
| method : {'average', 'min', 'max', 'first', 'dense'} |
| * average: average rank of group |
| * min: lowest rank in group |
| * max: highest rank in group |
| * first: ranks assigned in order they appear in the array |
| * dense: like 'min', but rank always increases by 1 between groups |
| ascending : boolean, default True |
| False for ranks by high (1) to low (N) |
| |
| Returns |
| ------- |
| ranks : same type as caller |
| |
| Examples |
| -------- |
>>> df = ps.DataFrame({'A': [1, 2, 2, 3], 'B': [4, 3, 2, 1]}, columns=['A', 'B'])
| >>> df |
| A B |
| 0 1 4 |
| 1 2 3 |
| 2 2 2 |
| 3 3 1 |
| |
| >>> df.rank().sort_index() |
| A B |
| 0 1.0 4.0 |
| 1 2.5 3.0 |
| 2 2.5 2.0 |
| 3 4.0 1.0 |
| |
If method is set to 'min', it uses the lowest rank in the group.
| |
| >>> df.rank(method='min').sort_index() |
| A B |
| 0 1.0 4.0 |
| 1 2.0 3.0 |
| 2 2.0 2.0 |
| 3 4.0 1.0 |
| |
If method is set to 'max', it uses the highest rank in the group.
| |
| >>> df.rank(method='max').sort_index() |
| A B |
| 0 1.0 4.0 |
| 1 3.0 3.0 |
| 2 3.0 2.0 |
| 3 4.0 1.0 |
| |
If method is set to 'dense', it leaves no gaps between ranks in the group.
| |
| >>> df.rank(method='dense').sort_index() |
| A B |
| 0 1.0 4.0 |
| 1 2.0 3.0 |
| 2 2.0 2.0 |
| 3 3.0 1.0 |
| """ |
| return self._apply_series_op( |
| lambda psser: psser._rank(method=method, ascending=ascending), should_resolve=True |
| ) |
| |
| def filter( |
| self, |
| items: Optional[Sequence[Any]] = None, |
| like: Optional[str] = None, |
| regex: Optional[str] = None, |
| axis: Optional[Axis] = None, |
| ) -> "DataFrame": |
| """ |
| Subset rows or columns of dataframe according to labels in |
| the specified index. |
| |
| Note that this routine does not filter a dataframe on its |
| contents. The filter is applied to the labels of the index. |
| |
| Parameters |
| ---------- |
| items : list-like |
| Keep labels from axis which are in items. |
| like : string |
| Keep labels from axis for which "like in label == True". |
| regex : string (regular expression) |
| Keep labels from axis for which re.search(regex, label) == True. |
| axis : int or string axis name |
| The axis to filter on. By default this is the info axis, |
| 'index' for Series, 'columns' for DataFrame. |
| |
| Returns |
| ------- |
| same type as input object |
| |
| See Also |
| -------- |
| DataFrame.loc |
| |
| Notes |
| ----- |
| The ``items``, ``like``, and ``regex`` parameters are |
| enforced to be mutually exclusive. |
| |
| ``axis`` defaults to the info axis that is used when indexing |
| with ``[]``. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame(np.array(([1, 2, 3], [4, 5, 6])), |
| ... index=['mouse', 'rabbit'], |
| ... columns=['one', 'two', 'three']) |
| |
| >>> # select columns by name |
| >>> df.filter(items=['one', 'three']) |
| one three |
| mouse 1 3 |
| rabbit 4 6 |
| |
| >>> # select columns by regular expression |
| >>> df.filter(regex='e$', axis=1) |
| one three |
| mouse 1 3 |
| rabbit 4 6 |
| |
| >>> # select rows containing 'bbi' |
| >>> df.filter(like='bbi', axis=0) |
| one two three |
| rabbit 4 5 6 |
| |
| For a Series, |
| |
| >>> # select rows by name |
| >>> df.one.filter(items=['rabbit']) |
| rabbit 4 |
| Name: one, dtype: int64 |
| |
| >>> # select rows by regular expression |
| >>> df.one.filter(regex='e$') |
| mouse 1 |
| Name: one, dtype: int64 |
| |
| >>> # select rows containing 'bbi' |
| >>> df.one.filter(like='bbi') |
| rabbit 4 |
| Name: one, dtype: int64 |
| """ |
| if sum(x is not None for x in (items, like, regex)) > 1: |
| raise TypeError( |
| "Keyword arguments `items`, `like`, or `regex` " "are mutually exclusive" |
| ) |
| |
| axis = validate_axis(axis, none_axis=1) |
| |
| index_scols = self._internal.index_spark_columns |
| |
| if items is not None: |
| if is_list_like(items): |
| items = list(items) |
| else: |
| raise ValueError("items should be a list-like object.") |
| if axis == 0: |
| if len(index_scols) == 1: |
| col = None |
| for item in items: |
| if col is None: |
| col = index_scols[0] == SF.lit(item) |
| else: |
| col = col | (index_scols[0] == SF.lit(item)) |
| elif len(index_scols) > 1: |
| # for multi-index |
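# Each tuple is matched level by level with AND and the per-item conditions
# are combined with OR, e.g. items=[("a", 1), ("b", 2)] becomes
# ((level0 == "a") & (level1 == 1)) | ((level0 == "b") & (level1 == 2)).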
| col = None |
| for item in items: |
| if not isinstance(item, tuple): |
| raise TypeError("Unsupported type {}".format(type(item).__name__)) |
| if not item: |
| raise ValueError("The item should not be empty.") |
| midx_col = None |
| for i, element in enumerate(item): |
| if midx_col is None: |
| midx_col = index_scols[i] == SF.lit(element) |
| else: |
| midx_col = midx_col & (index_scols[i] == SF.lit(element)) |
| if col is None: |
| col = midx_col |
| else: |
| col = col | midx_col |
| else: |
| raise ValueError("Single or multi index must be specified.") |
| return DataFrame(self._internal.with_filter(col)) |
| else: |
| return self[items] |
| elif like is not None: |
| if axis == 0: |
| col = None |
| for index_scol in index_scols: |
| if col is None: |
| col = index_scol.contains(like) |
| else: |
| col = col | index_scol.contains(like) |
| return DataFrame(self._internal.with_filter(col)) |
| else: |
| column_labels = self._internal.column_labels |
| output_labels = [label for label in column_labels if any(like in i for i in label)] |
| return self[output_labels] |
| elif regex is not None: |
| if axis == 0: |
| col = None |
| for index_scol in index_scols: |
| if col is None: |
| col = index_scol.rlike(regex) |
| else: |
| col = col | index_scol.rlike(regex) |
| return DataFrame(self._internal.with_filter(col)) |
| else: |
| column_labels = self._internal.column_labels |
| matcher = re.compile(regex) |
| output_labels = [ |
| label |
| for label in column_labels |
| if any(matcher.search(i) is not None for i in label) |
| ] |
| return self[output_labels] |
| else: |
| raise TypeError("Must pass either `items`, `like`, or `regex`") |
| |
| def rename( |
| self, |
| mapper: Optional[Union[Dict, Callable[[Any], Any]]] = None, |
| index: Optional[Union[Dict, Callable[[Any], Any]]] = None, |
| columns: Optional[Union[Dict, Callable[[Any], Any]]] = None, |
| axis: Axis = "index", |
| inplace: bool = False, |
| level: Optional[int] = None, |
| errors: str = "ignore", |
| ) -> Optional["DataFrame"]: |
| |
| """ |
| Alter axes labels. |
| Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series |
| will be left as-is. Extra labels listed don’t throw an error. |
| |
| Parameters |
| ---------- |
| mapper : dict-like or function |
Dict-like or function transformations to apply to that axis' values.
| Use either `mapper` and `axis` to specify the axis to target with `mapper`, or `index` |
| and `columns`. |
| index : dict-like or function |
| Alternative to specifying axis ("mapper, axis=0" is equivalent to "index=mapper"). |
| columns : dict-like or function |
| Alternative to specifying axis ("mapper, axis=1" is equivalent to "columns=mapper"). |
| axis : int or str, default 'index' |
| Axis to target with mapper. Can be either the axis name ('index', 'columns') or |
| number (0, 1). |
| inplace : bool, default False |
Whether to modify the DataFrame in place rather than returning a new one.
| level : int or level name, default None |
| In case of a MultiIndex, only rename labels in the specified level. |
errors : {'ignore', 'raise'}, default 'ignore'
| If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, or `columns` |
| contains labels that are not present in the Index being transformed. If 'ignore', |
| existing keys will be renamed and extra keys will be ignored. |
| |
| Returns |
| ------- |
| DataFrame with the renamed axis labels. |
| |
| Raises |
| ------ |
| `KeyError` |
| If any of the labels is not found in the selected axis and "errors='raise'". |
| |
| Examples |
| -------- |
| >>> psdf1 = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) |
| >>> psdf1.rename(columns={"A": "a", "B": "c"}) # doctest: +NORMALIZE_WHITESPACE |
| a c |
| 0 1 4 |
| 1 2 5 |
| 2 3 6 |
| |
| >>> psdf1.rename(index={1: 10, 2: 20}) # doctest: +NORMALIZE_WHITESPACE |
| A B |
| 0 1 4 |
| 10 2 5 |
| 20 3 6 |
| |
| >>> def str_lower(s) -> str: |
| ... return str.lower(s) |
| >>> psdf1.rename(str_lower, axis='columns') # doctest: +NORMALIZE_WHITESPACE |
| a b |
| 0 1 4 |
| 1 2 5 |
| 2 3 6 |
| |
| >>> def mul10(x) -> int: |
| ... return x * 10 |
| >>> psdf1.rename(mul10, axis='index') # doctest: +NORMALIZE_WHITESPACE |
| A B |
| 0 1 4 |
| 10 2 5 |
| 20 3 6 |
| |
| >>> idx = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C'), ('Y', 'D')]) |
| >>> psdf2 = ps.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=idx) |
| >>> psdf2.rename(columns=str_lower, level=0) # doctest: +NORMALIZE_WHITESPACE |
| x y |
| A B C D |
| 0 1 2 3 4 |
| 1 5 6 7 8 |
| |
| >>> psdf3 = ps.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=idx, columns=list('ab')) |
| >>> psdf3.rename(index=str_lower) # doctest: +NORMALIZE_WHITESPACE |
| a b |
| x a 1 2 |
| b 3 4 |
| y c 5 6 |
| d 7 8 |
| """ |
| |
| def gen_mapper_fn( |
| mapper: Union[Dict, Callable[[Any], Any]] |
| ) -> Tuple[Callable[[Any], Any], Dtype, DataType]: |
| if isinstance(mapper, dict): |
| mapper_dict = cast(dict, mapper) |
| if len(mapper_dict) == 0: |
| if errors == "raise": |
| raise KeyError("Index include label which is not in the `mapper`.") |
| else: |
| return DataFrame(self._internal) |
| |
| type_set = set(map(lambda x: type(x), mapper_dict.values())) |
| if len(type_set) > 1: |
| raise ValueError("Mapper dict should have the same value type.") |
| dtype, spark_return_type = pandas_on_spark_type(list(type_set)[0]) |
| |
| def mapper_fn(x: Any) -> Any: |
| if x in mapper_dict: |
| return mapper_dict[x] |
| else: |
| if errors == "raise": |
| raise KeyError("Index include value which is not in the `mapper`") |
| return x |
| |
| elif callable(mapper): |
| mapper_callable = cast(Callable, mapper) |
| return_type = cast(ScalarType, infer_return_type(mapper)) |
| dtype = return_type.dtype |
| spark_return_type = return_type.spark_type |
| |
| def mapper_fn(x: Any) -> Any: |
| return mapper_callable(x) |
| |
| else: |
| raise ValueError( |
| "`mapper` or `index` or `columns` should be " |
| "either dict-like or function type." |
| ) |
| return mapper_fn, dtype, spark_return_type |
| |
| index_mapper_fn = None |
| index_mapper_ret_stype = None |
| columns_mapper_fn = None |
| |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| if mapper: |
| axis = validate_axis(axis) |
| if axis == 0: |
| index_mapper_fn, index_mapper_ret_dtype, index_mapper_ret_stype = gen_mapper_fn( |
| mapper |
| ) |
| elif axis == 1: |
| columns_mapper_fn, _, _ = gen_mapper_fn(mapper) |
| else: |
| raise ValueError( |
| "argument axis should be either the axis name " |
| "(‘index’, ‘columns’) or number (0, 1)" |
| ) |
| else: |
| if index: |
| index_mapper_fn, index_mapper_ret_dtype, index_mapper_ret_stype = gen_mapper_fn( |
| index |
| ) |
| if columns: |
| columns_mapper_fn, _, _ = gen_mapper_fn(columns) |
| |
| if not index and not columns: |
| raise ValueError("Either `index` or `columns` should be provided.") |
| |
| psdf = self.copy() |
| if index_mapper_fn: |
# Rename index labels: if `level` is None, rename all index columns; otherwise only
# rename the index columns of the corresponding level.
# This is implemented by transforming the underlying spark dataframe.
# Example:
# suppose the psdf index columns in the underlying spark dataframe are "index_0", "index_1";
# renaming only the level 0 index labels will do:
#   ``psdf._sdf.withColumn("index_0", mapper_fn_udf(col("index_0"))``
# while renaming all index labels (`level` is None) will do:
#   ```
#   psdf._sdf.withColumn("index_0", mapper_fn_udf(col("index_0"))
#            .withColumn("index_1", mapper_fn_udf(col("index_1"))
#   ```
| |
| index_columns = psdf._internal.index_spark_column_names |
| num_indices = len(index_columns) |
if level is not None:
| if level < 0 or level >= num_indices: |
| raise ValueError("level should be an integer between [0, num_indices)") |
| |
| @pandas_udf(returnType=index_mapper_ret_stype) # type: ignore |
| def index_mapper_udf(s: pd.Series) -> pd.Series: |
| return s.map(index_mapper_fn) |
| |
| index_spark_columns = psdf._internal.index_spark_columns.copy() |
| index_fields = psdf._internal.index_fields.copy() |
| if level is None: |
| for i in range(num_indices): |
| index_spark_columns[i] = index_mapper_udf(index_spark_columns[i]).alias( |
| index_columns[i] |
| ) |
| index_fields[i] = index_fields[i].copy( |
| dtype=index_mapper_ret_dtype, |
| spark_type=index_mapper_ret_stype, |
| nullable=True, |
| ) |
| else: |
| index_spark_columns[level] = index_mapper_udf(index_spark_columns[level]).alias( |
| index_columns[level] |
| ) |
| index_fields[level] = index_fields[level].copy( |
| dtype=index_mapper_ret_dtype, |
| spark_type=index_mapper_ret_stype, |
| nullable=True, |
| ) |
| psdf = DataFrame( |
| psdf._internal.copy( |
| index_spark_columns=index_spark_columns, index_fields=index_fields |
| ) |
| ) |
| if columns_mapper_fn: |
# Rename column names.
# This modifies `_internal._column_labels` and transforms the underlying spark
# dataframe so that its column names match `_internal._column_labels`.
if level is not None:
| if level < 0 or level >= psdf._internal.column_labels_level: |
| raise ValueError("level should be an integer between [0, column_labels_level)") |
| |
| def gen_new_column_labels_entry(column_labels_entry: Label) -> Label: |
| if level is None: |
| # rename all level columns |
| return tuple(map(columns_mapper_fn, column_labels_entry)) |
| else: |
| # only rename specified level column |
| entry_list = list(column_labels_entry) |
| entry_list[level] = columns_mapper_fn(entry_list[level]) |
| return tuple(entry_list) |
| |
| new_column_labels = list(map(gen_new_column_labels_entry, psdf._internal.column_labels)) |
| |
| new_data_pssers = [ |
| psdf._psser_for(old_label).rename(new_label) |
| for old_label, new_label in zip(psdf._internal.column_labels, new_column_labels) |
| ] |
| psdf = DataFrame(psdf._internal.with_new_columns(new_data_pssers)) |
| if inplace: |
| self._update_internal_frame(psdf._internal) |
| return None |
| else: |
| return psdf |
| |
| def rename_axis( |
| self, |
| mapper: Union[Any, Sequence[Any], Dict[Name, Any], Callable[[Name], Any]] = None, |
| index: Union[Any, Sequence[Any], Dict[Name, Any], Callable[[Name], Any]] = None, |
| columns: Union[Any, Sequence[Any], Dict[Name, Any], Callable[[Name], Any]] = None, |
| axis: Optional[Axis] = 0, |
| inplace: Optional[bool] = False, |
| ) -> Optional["DataFrame"]: |
| """ |
| Set the name of the axis for the index or columns. |
| |
| Parameters |
| ---------- |
| mapper : scalar, list-like, optional |
A scalar, list-like, dict-like or function transformation to
| apply to the axis name attribute. |
| index, columns : scalar, list-like, dict-like or function, optional |
A scalar, list-like, dict-like or function transformation to
| apply to that axis' values. |
| |
| Use either ``mapper`` and ``axis`` to |
| specify the axis to target with ``mapper``, or ``index`` |
| and/or ``columns``. |
| axis : {0 or 'index', 1 or 'columns'}, default 0 |
| The axis to rename. |
| inplace : bool, default False |
| Modifies the object directly, instead of creating a new DataFrame. |
| |
| Returns |
| ------- |
| DataFrame, or None if `inplace` is True. |
| |
| See Also |
| -------- |
| Series.rename : Alter Series index labels or name. |
| DataFrame.rename : Alter DataFrame index labels or name. |
| Index.rename : Set new names on index. |
| |
| Notes |
| ----- |
| ``DataFrame.rename_axis`` supports two calling conventions |
| |
| * ``(index=index_mapper, columns=columns_mapper, ...)`` |
| * ``(mapper, axis={'index', 'columns'}, ...)`` |
| |
| The first calling convention will only modify the names of |
| the index and/or the names of the Index object that is the columns. |
| |
| The second calling convention will modify the names of the |
| corresponding index specified by axis. |
| |
| We *highly* recommend using keyword arguments to clarify your |
| intent. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({"num_legs": [4, 4, 2], |
| ... "num_arms": [0, 0, 2]}, |
| ... index=["dog", "cat", "monkey"], |
| ... columns=["num_legs", "num_arms"]) |
| >>> df |
| num_legs num_arms |
| dog 4 0 |
| cat 4 0 |
| monkey 2 2 |
| |
| >>> df = df.rename_axis("animal").sort_index() |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_arms |
| animal |
| cat 4 0 |
| dog 4 0 |
| monkey 2 2 |
| |
| >>> df = df.rename_axis("limbs", axis="columns").sort_index() |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| limbs num_legs num_arms |
| animal |
| cat 4 0 |
| dog 4 0 |
| monkey 2 2 |
| |
| **MultiIndex** |
| |
| >>> index = pd.MultiIndex.from_product([['mammal'], |
| ... ['dog', 'cat', 'monkey']], |
| ... names=['type', 'name']) |
| >>> df = ps.DataFrame({"num_legs": [4, 4, 2], |
| ... "num_arms": [0, 0, 2]}, |
| ... index=index, |
| ... columns=["num_legs", "num_arms"]) |
| >>> df # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_arms |
| type name |
| mammal dog 4 0 |
| cat 4 0 |
| monkey 2 2 |
| |
| >>> df.rename_axis(index={'type': 'class'}).sort_index() # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_arms |
| class name |
| mammal cat 4 0 |
| dog 4 0 |
| monkey 2 2 |
| |
| >>> df.rename_axis(index=str.upper).sort_index() # doctest: +NORMALIZE_WHITESPACE |
| num_legs num_arms |
| TYPE NAME |
| mammal cat 4 0 |
| dog 4 0 |
| monkey 2 2 |
| """ |
| |
| def gen_names( |
| v: Union[Any, Sequence[Any], Dict[Name, Any], Callable[[Name], Any]], |
| curnames: List[Name], |
| ) -> List[Label]: |
| if is_scalar(v): |
| newnames = [cast(Any, v)] # type: List[Name] |
| elif is_list_like(v) and not is_dict_like(v): |
| newnames = list(cast(Sequence[Any], v)) |
| elif is_dict_like(v): |
| v_dict = cast(Dict[Name, Any], v) |
| newnames = [v_dict[name] if name in v_dict else name for name in curnames] |
| elif callable(v): |
| v_callable = cast(Callable[[Name], Any], v) |
| newnames = [v_callable(name) for name in curnames] |
| else: |
| raise ValueError( |
| "`mapper` or `index` or `columns` should be " |
| "either dict-like or function type." |
| ) |
| |
| if len(newnames) != len(curnames): |
| raise ValueError( |
| "Length of new names must be {}, got {}".format(len(curnames), len(newnames)) |
| ) |
| |
| return [name if is_name_like_tuple(name) else (name,) for name in newnames] |
| |
| if mapper is not None and (index is not None or columns is not None): |
| raise TypeError("Cannot specify both 'mapper' and any of 'index' or 'columns'.") |
| |
| if mapper is not None: |
| axis = validate_axis(axis) |
| if axis == 0: |
| index = mapper |
| elif axis == 1: |
| columns = mapper |
| |
| column_label_names = ( |
| gen_names(columns, self.columns.names) |
| if columns is not None |
| else self._internal.column_label_names |
| ) |
| index_names = ( |
| gen_names(index, self.index.names) if index is not None else self._internal.index_names |
| ) |
| |
| internal = self._internal.copy( |
| index_names=index_names, column_label_names=column_label_names |
| ) |
| if inplace: |
| self._update_internal_frame(internal) |
| return None |
| else: |
| return DataFrame(internal) |
| |
| def keys(self) -> pd.Index: |
| """ |
| Return alias for columns. |
| |
| Returns |
| ------- |
| Index |
| Columns of the DataFrame. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([[1, 2], [4, 5], [7, 8]], |
| ... index=['cobra', 'viper', 'sidewinder'], |
| ... columns=['max_speed', 'shield']) |
| >>> df |
| max_speed shield |
| cobra 1 2 |
| viper 4 5 |
| sidewinder 7 8 |
| |
| >>> df.keys() |
| Index(['max_speed', 'shield'], dtype='object') |
| """ |
| return self.columns |
| |
| def pct_change(self, periods: int = 1) -> "DataFrame": |
| """ |
| Percentage change between the current and a prior element. |
| |
.. note:: the current implementation of this API uses Spark's Window without
specifying partition specification. This moves all data into a single
partition on a single machine and could cause serious performance
degradation. Avoid this method against very large datasets.
| |
| Parameters |
| ---------- |
| periods : int, default 1 |
| Periods to shift for forming percent change. |
| |
| Returns |
| ------- |
| DataFrame |
| |
| Examples |
| -------- |
| Percentage change in French franc, Deutsche Mark, and Italian lira |
| from 1980-01-01 to 1980-03-01. |
| |
| >>> df = ps.DataFrame({ |
| ... 'FR': [4.0405, 4.0963, 4.3149], |
| ... 'GR': [1.7246, 1.7482, 1.8519], |
| ... 'IT': [804.74, 810.01, 860.13]}, |
| ... index=['1980-01-01', '1980-02-01', '1980-03-01']) |
| >>> df |
| FR GR IT |
| 1980-01-01 4.0405 1.7246 804.74 |
| 1980-02-01 4.0963 1.7482 810.01 |
| 1980-03-01 4.3149 1.8519 860.13 |
| |
| >>> df.pct_change() |
| FR GR IT |
| 1980-01-01 NaN NaN NaN |
| 1980-02-01 0.013810 0.013684 0.006549 |
| 1980-03-01 0.053365 0.059318 0.061876 |
| |
| You can set periods to shift for forming percent change |
| |
| >>> df.pct_change(2) |
| FR GR IT |
| 1980-01-01 NaN NaN NaN |
| 1980-02-01 NaN NaN NaN |
| 1980-03-01 0.067912 0.073814 0.06883 |
| """ |
| window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-periods, -periods) |
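# `F.lag` over the natural row order fetches the value `periods` rows earlier;
# rows without a predecessor yield null, hence the leading NaN results.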
| |
| def op(psser: ps.Series) -> Column: |
| prev_row = F.lag(psser.spark.column, periods).over(window) |
| return ((psser.spark.column - prev_row) / prev_row).alias( |
| psser._internal.data_spark_column_names[0] |
| ) |
| |
| return self._apply_series_op(op, should_resolve=True) |
| |
| # TODO: axis = 1 |
| def idxmax(self, axis: Axis = 0) -> "Series": |
| """ |
| Return index of first occurrence of maximum over requested axis. |
| NA/null values are excluded. |
| |
.. note:: This API collects all rows with the maximum value using `to_pandas()`
because the number of rows with the max value is assumed to be usually small.
| |
| Parameters |
| ---------- |
| axis : 0 or 'index' |
| Can only be set to 0 at the moment. |
| |
| Returns |
| ------- |
| Series |
| |
| See Also |
| -------- |
| Series.idxmax |
| |
| Examples |
| -------- |
| >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], |
| ... 'b': [4.0, 2.0, 3.0, 1.0], |
| ... 'c': [300, 200, 400, 200]}) |
| >>> psdf |
| a b c |
| 0 1 4.0 300 |
| 1 2 2.0 200 |
| 2 3 3.0 400 |
| 3 2 1.0 200 |
| |
| >>> psdf.idxmax() |
| a 2 |
| b 0 |
| c 2 |
| dtype: int64 |
| |
| For Multi-column Index |
| |
| >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], |
| ... 'b': [4.0, 2.0, 3.0, 1.0], |
| ... 'c': [300, 200, 400, 200]}) |
| >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) |
| >>> psdf |
| a b c |
| x y z |
| 0 1 4.0 300 |
| 1 2 2.0 200 |
| 2 3 3.0 400 |
| 3 2 1.0 200 |
| |
| >>> psdf.idxmax() |
| a x 2 |
| b y 0 |
| c z 2 |
| dtype: int64 |
| """ |
| max_cols = map(lambda scol: F.max(scol), self._internal.data_spark_columns) |
| sdf_max = self._internal.spark_frame.select(*max_cols).head() |
| # `sdf_max` looks like below |
| # +------+------+------+ |
| # |(a, x)|(b, y)|(c, z)| |
| # +------+------+------+ |
| # | 3| 4.0| 400| |
| # +------+------+------+ |
| |
| conds = ( |
| scol == max_val for scol, max_val in zip(self._internal.data_spark_columns, sdf_max) |
| ) |
| cond = reduce(lambda x, y: x | y, conds) |
| |
| psdf = DataFrame(self._internal.with_filter(cond)) # type: "DataFrame" |
| |
| return cast(ps.Series, ps.from_pandas(psdf._to_internal_pandas().idxmax())) |
| |
| # TODO: axis = 1 |
| def idxmin(self, axis: Axis = 0) -> "Series": |
| """ |
| Return index of first occurrence of minimum over requested axis. |
| NA/null values are excluded. |
| |
.. note:: This API collects all rows with the minimum value using `to_pandas()`
because the number of rows with the min value is assumed to be usually small.
| |
| Parameters |
| ---------- |
| axis : 0 or 'index' |
| Can only be set to 0 at the moment. |
| |
| Returns |
| ------- |
| Series |
| |
| See Also |
| -------- |
| Series.idxmin |
| |
| Examples |
| -------- |
| >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], |
| ... 'b': [4.0, 2.0, 3.0, 1.0], |
| ... 'c': [300, 200, 400, 200]}) |
| >>> psdf |
| a b c |
| 0 1 4.0 300 |
| 1 2 2.0 200 |
| 2 3 3.0 400 |
| 3 2 1.0 200 |
| |
| >>> psdf.idxmin() |
| a 0 |
| b 3 |
| c 1 |
| dtype: int64 |
| |
For MultiIndex columns
| |
| >>> psdf = ps.DataFrame({'a': [1, 2, 3, 2], |
| ... 'b': [4.0, 2.0, 3.0, 1.0], |
| ... 'c': [300, 200, 400, 200]}) |
| >>> psdf.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')]) |
| >>> psdf |
| a b c |
| x y z |
| 0 1 4.0 300 |
| 1 2 2.0 200 |
| 2 3 3.0 400 |
| 3 2 1.0 200 |
| |
| >>> psdf.idxmin() |
| a x 0 |
| b y 3 |
| c z 1 |
| dtype: int64 |
| """ |
| min_cols = map(lambda scol: F.min(scol), self._internal.data_spark_columns) |
| sdf_min = self._internal.spark_frame.select(*min_cols).head() |
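# A row is kept if any of its values equals its column's minimum; the
# filtered frame is assumed small enough to collect and finish in pandas.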
| |
| conds = ( |
| scol == min_val for scol, min_val in zip(self._internal.data_spark_columns, sdf_min) |
| ) |
| cond = reduce(lambda x, y: x | y, conds) |
| |
| psdf = DataFrame(self._internal.with_filter(cond)) # type: "DataFrame" |
| |
| return cast(ps.Series, ps.from_pandas(psdf._to_internal_pandas().idxmin())) |
| |
| def info( |
| self, |
| verbose: Optional[bool] = None, |
| buf: Optional[IO[str]] = None, |
| max_cols: Optional[int] = None, |
| null_counts: Optional[bool] = None, |
| ) -> None: |
| """ |
| Print a concise summary of a DataFrame. |
| |
| This method prints information about a DataFrame including |
| the index dtype and column dtypes, non-null values and memory usage. |
| |
| Parameters |
| ---------- |
| verbose : bool, optional |
| Whether to print the full summary. |
| buf : writable buffer, defaults to sys.stdout |
| Where to send the output. By default, the output is printed to |
| sys.stdout. Pass a writable buffer if you need to further process |
| the output. |
| max_cols : int, optional |
| When to switch from the verbose to the truncated output. If the |
| DataFrame has more than `max_cols` columns, the truncated output |
| is used. |
| null_counts : bool, optional |
| Whether to show the non-null counts. |
| |
| Returns |
| ------- |
| None |
| This method prints a summary of a DataFrame and returns None. |
| |
| See Also |
| -------- |
| DataFrame.describe: Generate descriptive statistics of DataFrame |
| columns. |
| |
| Examples |
| -------- |
| >>> int_values = [1, 2, 3, 4, 5] |
| >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] |
| >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] |
| >>> df = ps.DataFrame( |
| ... {"int_col": int_values, "text_col": text_values, "float_col": float_values}, |
| ... columns=['int_col', 'text_col', 'float_col']) |
| >>> df |
| int_col text_col float_col |
| 0 1 alpha 0.00 |
| 1 2 beta 0.25 |
| 2 3 gamma 0.50 |
| 3 4 delta 0.75 |
| 4 5 epsilon 1.00 |
| |
| Prints information of all columns: |
| |
| >>> df.info(verbose=True) # doctest: +SKIP |
| <class 'pyspark.pandas.frame.DataFrame'> |
| Index: 5 entries, 0 to 4 |
| Data columns (total 3 columns): |
| # Column Non-Null Count Dtype |
| --- ------ -------------- ----- |
| 0 int_col 5 non-null int64 |
| 1 text_col 5 non-null object |
| 2 float_col 5 non-null float64 |
| dtypes: float64(1), int64(1), object(1) |
| |
Prints a summary of the column count and dtypes but not per-column
information:
| |
| >>> df.info(verbose=False) # doctest: +SKIP |
| <class 'pyspark.pandas.frame.DataFrame'> |
| Index: 5 entries, 0 to 4 |
| Columns: 3 entries, int_col to float_col |
| dtypes: float64(1), int64(1), object(1) |
| |
Pipe the output of DataFrame.info to a buffer instead of sys.stdout, get the
buffer content and write it to a text file:
| |
| >>> import io |
| >>> buffer = io.StringIO() |
| >>> df.info(buf=buffer) |
| >>> s = buffer.getvalue() |
| >>> with open('%s/info.txt' % path, "w", |
| ... encoding="utf-8") as f: |
| ... _ = f.write(s) |
| >>> with open('%s/info.txt' % path) as f: |
| ... f.readlines() # doctest: +SKIP |
| ["<class 'pyspark.pandas.frame.DataFrame'>\\n", |
| 'Index: 5 entries, 0 to 4\\n', |
| 'Data columns (total 3 columns):\\n', |
| ' # Column Non-Null Count Dtype \\n', |
| '--- ------ -------------- ----- \\n', |
| ' 0 int_col 5 non-null int64 \\n', |
| ' 1 text_col 5 non-null object \\n', |
| ' 2 float_col 5 non-null float64\\n', |
| 'dtypes: float64(1), int64(1), object(1)'] |
| """ |
# To avoid pandas' existing config affecting pandas-on-Spark.
| # TODO: should we have corresponding pandas-on-Spark configs? |
| with pd.option_context( |
| "display.max_info_columns", sys.maxsize, "display.max_info_rows", sys.maxsize |
| ): |
| try: |
| # hack to use pandas' info as is. |
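# pandas' `info` is reused unmodified, so `_data` and `count` are temporarily
# patched with pandas-compatible stand-ins and restored in the `finally` block.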
| object.__setattr__(self, "_data", self) |
| count_func = self.count |
| self.count = lambda: count_func().to_pandas() # type: ignore |
| return pd.DataFrame.info( |
| self, |
| verbose=verbose, |
| buf=buf, |
| max_cols=max_cols, |
| memory_usage=False, |
| null_counts=null_counts, |
| ) |
| finally: |
| del self._data |
| self.count = count_func # type: ignore |
| |
| # TODO: fix parameter 'axis' and 'numeric_only' to work same as pandas' |
| def quantile( |
| self, |
| q: Union[float, Iterable[float]] = 0.5, |
| axis: Axis = 0, |
| numeric_only: bool = True, |
| accuracy: int = 10000, |
| ) -> DataFrameOrSeries: |
| """ |
| Return value at the given quantile. |
| |
.. note:: Unlike pandas, the quantile in pandas-on-Spark is an approximated quantile
based upon approximate percentile computation, because computing an exact
quantile across a large dataset is extremely expensive.
| |
| Parameters |
| ---------- |
| q : float or array-like, default 0.5 (50% quantile) |
| 0 <= q <= 1, the quantile(s) to compute. |
| axis : int or str, default 0 or 'index' |
| Can only be set to 0 at the moment. |
| numeric_only : bool, default True |
| If False, the quantile of datetime and timedelta data will be computed as well. |
| Can only be set to True at the moment. |
| accuracy : int, optional |
| Default accuracy of approximation. Larger value means better accuracy. |
The relative error is 1.0 / accuracy.
| |
| Returns |
| ------- |
| Series or DataFrame |
| If q is an array, a DataFrame will be returned where the |
| index is q, the columns are the columns of self, and the values are the quantiles. |
| If q is a float, a Series will be returned where the |
| index is the columns of self and the values are the quantiles. |
| |
| Examples |
| -------- |
| >>> psdf = ps.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [6, 7, 8, 9, 0]}) |
| >>> psdf |
| a b |
| 0 1 6 |
| 1 2 7 |
| 2 3 8 |
| 3 4 9 |
| 4 5 0 |
| |
| >>> psdf.quantile(.5) |
| a 3.0 |
| b 7.0 |
| Name: 0.5, dtype: float64 |
| |
| >>> psdf.quantile([.25, .5, .75]) |
| a b |
| 0.25 2.0 6.0 |
| 0.50 3.0 7.0 |
| 0.75 4.0 8.0 |
| """ |
| axis = validate_axis(axis) |
| if axis != 0: |
| raise NotImplementedError('axis should be either 0 or "index" currently.') |
| |
| if not isinstance(accuracy, int): |
| raise TypeError( |
| "accuracy must be an integer; however, got [%s]" % type(accuracy).__name__ |
| ) |
| |
| qq = list(q) if isinstance(q, Iterable) else q # type: Union[float, List[float]] |
| |
| for v in qq if isinstance(qq, list) else [qq]: |
| if not isinstance(v, float): |
| raise TypeError( |
| "q must be a float or an array of floats; however, [%s] found." % type(v) |
| ) |
| if v < 0.0 or v > 1.0: |
| raise ValueError("percentiles should all be in the interval [0, 1].") |
| |
| def quantile(spark_column: Column, spark_type: DataType) -> Column: |
| if isinstance(spark_type, (BooleanType, NumericType)): |
| return F.percentile_approx(spark_column.cast(DoubleType()), qq, accuracy) |
| else: |
| raise TypeError( |
| "Could not convert {} ({}) to numeric".format( |
| spark_type_to_pandas_dtype(spark_type), spark_type.simpleString() |
| ) |
| ) |
| |
| if isinstance(qq, list): |
# First calculate the percentiles from all columns and map them to each of the
# `quantiles` by creating each entry as a struct. So, it becomes an array of structs as below:
| # |
| # +-----------------------------------------+ |
| # | arrays| |
| # +-----------------------------------------+ |
| # |[[0.25, 2, 6], [0.5, 3, 7], [0.75, 4, 8]]| |
| # +-----------------------------------------+ |
| |
| percentile_cols = [] |
| percentile_col_names = [] |
| column_labels = [] |
| for label, column in zip( |
| self._internal.column_labels, self._internal.data_spark_column_names |
| ): |
| spark_type = self._internal.spark_type_for(label) |
| |
| is_numeric_or_boolean = isinstance(spark_type, (NumericType, BooleanType)) |
| keep_column = not numeric_only or is_numeric_or_boolean |
| |
| if keep_column: |
| percentile_col = quantile(self._internal.spark_column_for(label), spark_type) |
| percentile_cols.append(percentile_col.alias(column)) |
| percentile_col_names.append(column) |
| column_labels.append(label) |
| |
| if len(percentile_cols) == 0: |
| return DataFrame(index=qq) |
| |
| sdf = self._internal.spark_frame.select(percentile_cols) |
# Here, after selecting the percentile cols, the spark_frame looks like below:
| # +---------+---------+ |
| # | a| b| |
| # +---------+---------+ |
| # |[2, 3, 4]|[6, 7, 8]| |
| # +---------+---------+ |
| |
| cols_dict = OrderedDict() # type: OrderedDict |
| for column in percentile_col_names: |
| cols_dict[column] = list() |
| for i in range(len(qq)): |
| cols_dict[column].append(scol_for(sdf, column)[i].alias(column)) |
| |
| internal_index_column = SPARK_DEFAULT_INDEX_NAME |
| cols = [] |
| for i, col in enumerate(zip(*cols_dict.values())): |
| cols.append(F.struct(SF.lit(qq[i]).alias(internal_index_column), *col)) |
| sdf = sdf.select(F.array(*cols).alias("arrays")) |
| |
| # And then, explode it and manually set the index. |
| # +-----------------+---+---+ |
| # |__index_level_0__| a| b| |
| # +-----------------+---+---+ |
| # | 0.25| 2| 6| |
| # | 0.5| 3| 7| |
| # | 0.75| 4| 8| |
| # +-----------------+---+---+ |
| sdf = sdf.select(F.explode(F.col("arrays"))).selectExpr("col.*") |
| |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, internal_index_column)], |
| column_labels=column_labels, |
| data_spark_columns=[scol_for(sdf, col) for col in percentile_col_names], |
| ) |
| return DataFrame(internal) |
| else: |
| return self._reduce_for_stat_function( |
| quantile, name="quantile", numeric_only=numeric_only |
| ).rename(qq) |
| |
| def query(self, expr: str, inplace: bool = False) -> Optional["DataFrame"]: |
| """ |
| Query the columns of a DataFrame with a boolean expression. |
| |
.. note:: Internal columns starting with a '__' prefix can be accessed; however,
they are not supposed to be accessed.
| |
.. note:: This API delegates to Spark SQL, so the syntax follows Spark SQL. Therefore, the
pandas-specific syntax such as `@` is not supported. If you want the pandas syntax,
you can work around with :meth:`DataFrame.pandas_on_spark.apply_batch`, but you should
be aware that `query_func` will be executed on different nodes in a distributed manner.
So, for example, to use the `@` syntax, make sure the variable is serialized by, for
example, putting it within the closure as below.
| |
| >>> df = ps.DataFrame({'A': range(2000), 'B': range(2000)}) |
| >>> def query_func(pdf): |
| ... num = 1995 |
| ... return pdf.query('A > @num') |
| >>> df.pandas_on_spark.apply_batch(query_func) |
| A B |
| 1996 1996 1996 |
| 1997 1997 1997 |
| 1998 1998 1998 |
| 1999 1999 1999 |
| |
| Parameters |
| ---------- |
| expr : str |
| The query string to evaluate. |
| |
| You can refer to column names that contain spaces by surrounding |
| them in backticks. |
| |
| For example, if one of your columns is called ``a a`` and you want |
| to sum it with ``b``, your query should be ```a a` + b``. |
| |
| inplace : bool |
| Whether the query should modify the data in place or return |
| a modified copy. |
| |
| Returns |
| ------- |
| DataFrame |
| DataFrame resulting from the provided query expression. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': range(1, 6), |
| ... 'B': range(10, 0, -2), |
| ... 'C C': range(10, 5, -1)}) |
| >>> df |
| A B C C |
| 0 1 10 10 |
| 1 2 8 9 |
| 2 3 6 8 |
| 3 4 4 7 |
| 4 5 2 6 |
| |
| >>> df.query('A > B') |
| A B C C |
| 4 5 2 6 |
| |
| The previous expression is equivalent to |
| |
| >>> df[df.A > df.B] |
| A B C C |
| 4 5 2 6 |
| |
| For columns with spaces in their name, you can use backtick quoting. |
| |
| >>> df.query('B == `C C`') |
| A B C C |
| 0 1 10 10 |
| |
| The previous expression is equivalent to |
| |
| >>> df[df.B == df['C C']] |
| A B C C |
| 0 1 10 10 |
| """ |
| if isinstance(self.columns, pd.MultiIndex): |
| raise TypeError("Doesn't support for MultiIndex columns") |
| if not isinstance(expr, str): |
| raise TypeError( |
| "expr must be a string to be evaluated, {} given".format(type(expr).__name__) |
| ) |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| |
| data_columns = [label[0] for label in self._internal.column_labels] |
| sdf = self._internal.spark_frame.select( |
| self._internal.index_spark_columns |
| + [ |
| scol.alias(col) |
| for scol, col in zip(self._internal.data_spark_columns, data_columns) |
| ] |
| ).filter(expr) |
| internal = self._internal.with_new_sdf(sdf, data_columns=data_columns) |
| |
| if inplace: |
| self._update_internal_frame(internal) |
| return None |
| else: |
| return DataFrame(internal) |
| |
| def take(self, indices: List[int], axis: Axis = 0, **kwargs: Any) -> "DataFrame": |
| """ |
| Return the elements in the given *positional* indices along an axis. |
| |
| This means that we are not indexing according to actual values in |
| the index attribute of the object. We are indexing according to the |
| actual position of the element in the object. |
| |
| Parameters |
| ---------- |
| indices : array-like |
| An array of ints indicating which positions to take. |
| axis : {0 or 'index', 1 or 'columns', None}, default 0 |
| The axis on which to select elements. ``0`` means that we are |
| selecting rows, ``1`` means that we are selecting columns. |
| **kwargs |
| For compatibility with :meth:`numpy.take`. Has no effect on the |
| output. |
| |
| Returns |
| ------- |
| taken : same type as caller |
| An array-like containing the elements taken from the object. |
| |
| See Also |
| -------- |
| DataFrame.loc : Select a subset of a DataFrame by labels. |
| DataFrame.iloc : Select a subset of a DataFrame by positions. |
| numpy.take : Take elements from an array along an axis. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame([('falcon', 'bird', 389.0), |
| ... ('parrot', 'bird', 24.0), |
| ... ('lion', 'mammal', 80.5), |
| ... ('monkey', 'mammal', np.nan)], |
| ... columns=['name', 'class', 'max_speed'], |
| ... index=[0, 2, 3, 1]) |
| >>> df |
| name class max_speed |
| 0 falcon bird 389.0 |
| 2 parrot bird 24.0 |
| 3 lion mammal 80.5 |
| 1 monkey mammal NaN |
| |
| Take elements at positions 0 and 3 along the axis 0 (default). |
| |
| Note how the actual indices selected (0 and 1) do not correspond to |
| our selected indices 0 and 3. That's because we are selecting the 0th |
| and 3rd rows, not rows whose indices equal 0 and 3. |
| |
| >>> df.take([0, 3]).sort_index() |
| name class max_speed |
| 0 falcon bird 389.0 |
| 1 monkey mammal NaN |
| |
| Take elements at indices 1 and 2 along the axis 1 (column selection). |
| |
| >>> df.take([1, 2], axis=1) |
| class max_speed |
| 0 bird 389.0 |
| 2 bird 24.0 |
| 3 mammal 80.5 |
| 1 mammal NaN |
| |
| We may take elements using negative integers for positive indices, |
| starting from the end of the object, just like with Python lists. |
| |
| >>> df.take([-1, -2]).sort_index() |
| name class max_speed |
| 1 monkey mammal NaN |
| 3 lion mammal 80.5 |
| """ |
| axis = validate_axis(axis) |
| if not is_list_like(indices) or isinstance(indices, (dict, set)): |
| raise TypeError("`indices` must be a list-like except dict or set") |
| if axis == 0: |
| return cast(DataFrame, self.iloc[indices, :]) |
| else: |
| return cast(DataFrame, self.iloc[:, indices]) |
| |
| def eval(self, expr: str, inplace: bool = False) -> Optional[DataFrameOrSeries]: |
| """ |
| Evaluate a string describing operations on DataFrame columns. |
| |
| Operates on columns only, not specific rows or elements. This allows |
| `eval` to run arbitrary code, which can make you vulnerable to code |
| injection if you pass user input to this function. |
| |
| Parameters |
| ---------- |
| expr : str |
| The expression string to evaluate. |
| inplace : bool, default False |
| If the expression contains an assignment, whether to perform the |
| operation inplace and mutate the existing DataFrame. Otherwise, |
| a new DataFrame is returned. |
| |
| Returns |
| ------- |
| The result of the evaluation. |
| |
| See Also |
| -------- |
| DataFrame.query : Evaluates a boolean expression to query the columns |
| of a frame. |
| DataFrame.assign : Can evaluate an expression or function to create new |
| values for a column. |
| eval : Evaluate a Python expression as a string using various |
| backends. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) |
| >>> df |
| A B |
| 0 1 10 |
| 1 2 8 |
| 2 3 6 |
| 3 4 4 |
| 4 5 2 |
| >>> df.eval('A + B') |
| 0 11 |
| 1 10 |
| 2 9 |
| 3 8 |
| 4 7 |
| dtype: int64 |
| |
| Assignment is allowed though by default the original DataFrame is not |
| modified. |
| |
| >>> df.eval('C = A + B') |
| A B C |
| 0 1 10 11 |
| 1 2 8 10 |
| 2 3 6 9 |
| 3 4 4 8 |
| 4 5 2 7 |
| >>> df |
| A B |
| 0 1 10 |
| 1 2 8 |
| 2 3 6 |
| 3 4 4 |
| 4 5 2 |
| |
| Use ``inplace=True`` to modify the original DataFrame. |
| |
| >>> df.eval('C = A + B', inplace=True) |
| >>> df |
| A B C |
| 0 1 10 11 |
| 1 2 8 10 |
| 2 3 6 9 |
| 3 4 4 8 |
| 4 5 2 7 |
| """ |
| from pyspark.pandas.series import first_series |
| |
| if isinstance(self.columns, pd.MultiIndex): |
| raise TypeError("`eval` is not supported for multi-index columns") |
| inplace = validate_bool_kwarg(inplace, "inplace") |
| should_return_series = False |
| series_name = None |
| should_return_scalar = False |
| |
# Since `eval_func` doesn't have a type hint, inferring the schema is always performed
| # in the `apply_batch`. Hence, the variables `should_return_series`, `series_name`, |
| # and `should_return_scalar` can be updated. |
| @no_type_check |
| def eval_func(pdf): |
| nonlocal should_return_series |
| nonlocal series_name |
| nonlocal should_return_scalar |
| result_inner = pdf.eval(expr, inplace=inplace) |
| if inplace: |
| result_inner = pdf |
| if isinstance(result_inner, pd.Series): |
| should_return_series = True |
| series_name = result_inner.name |
| result_inner = result_inner.to_frame() |
| elif is_scalar(result_inner): |
| should_return_scalar = True |
| result_inner = pd.Series(result_inner).to_frame() |
| return result_inner |
| |
| result = self.pandas_on_spark.apply_batch(eval_func) |
| if inplace: |
# Here, the result is always a frame because an invalid expression would already
# have raised an error during pandas' schema inference.
| self._update_internal_frame(result._internal, requires_same_anchor=False) |
| return None |
| elif should_return_series: |
| return first_series(result).rename(series_name) |
| elif should_return_scalar: |
| return first_series(result)[0] |
| else: |
| # Returns a frame |
| return result |
| |
| def explode(self, column: Name) -> "DataFrame": |
| """ |
| Transform each element of a list-like to a row, replicating index values. |
| |
| Parameters |
| ---------- |
| column : str or tuple |
| Column to explode. |
| |
| Returns |
| ------- |
| DataFrame |
| Exploded lists to rows of the subset columns; |
| index will be duplicated for these rows. |
| |
| See Also |
| -------- |
| DataFrame.unstack : Pivot a level of the (necessarily hierarchical) |
| index labels. |
| DataFrame.melt : Unpivot a DataFrame from wide format to long format. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'A': [[1, 2, 3], [], [3, 4]], 'B': 1}) |
| >>> df |
| A B |
| 0 [1, 2, 3] 1 |
| 1 [] 1 |
| 2 [3, 4] 1 |
| |
| >>> df.explode('A') |
| A B |
| 0 1.0 1 |
| 0 2.0 1 |
| 0 3.0 1 |
| 1 NaN 1 |
| 2 3.0 1 |
| 2 4.0 1 |
| """ |
| from pyspark.pandas.series import Series |
| |
| if not is_name_like_value(column): |
| raise TypeError("column must be a scalar") |
| |
| psdf = DataFrame(self._internal.resolved_copy) # type: "DataFrame" |
| psser = psdf[column] |
| if not isinstance(psser, Series): |
| raise ValueError( |
| "The column %s is not unique. For a multi-index, the label must be a tuple " |
| "with elements corresponding to each level." % name_like_string(column) |
| ) |
| if not isinstance(psser.spark.data_type, ArrayType): |
| return self.copy() |
| |
| sdf = psdf._internal.spark_frame.withColumn( |
| psser._internal.data_spark_column_names[0], F.explode_outer(psser.spark.column) |
| ) |
| |
| data_fields = psdf._internal.data_fields.copy() |
| idx = psdf._internal.column_labels.index(psser._column_label) |
| field = data_fields[idx] |
| spark_type = cast(ArrayType, field.spark_type).elementType |
| dtype = spark_type_to_pandas_dtype(spark_type) |
| data_fields[idx] = field.copy(dtype=dtype, spark_type=spark_type, nullable=True) |
| |
| internal = psdf._internal.with_new_sdf(sdf, data_fields=data_fields) |
| return DataFrame(internal) |
| |
| def mad(self, axis: Axis = 0) -> "Series": |
| """ |
| Return the mean absolute deviation of values. |
| |
| Parameters |
| ---------- |
| axis : {index (0), columns (1)} |
| Axis for the function to be applied on. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'a': [1, 2, 3, np.nan], 'b': [0.1, 0.2, 0.3, np.nan]}, |
| ... columns=['a', 'b']) |
| |
| >>> df.mad() |
| a 0.666667 |
| b 0.066667 |
| dtype: float64 |
| |
| >>> df.mad(axis=1) |
| 0 0.45 |
| 1 0.90 |
| 2 1.35 |
| 3 NaN |
| dtype: float64 |
| """ |
| from pyspark.pandas.series import first_series |
| |
| axis = validate_axis(axis) |
| |
| if axis == 0: |
| |
| def get_spark_column(psdf: DataFrame, label: Label) -> Column: |
| scol = psdf._internal.spark_column_for(label) |
| col_type = psdf._internal.spark_type_for(label) |
| |
| if isinstance(col_type, BooleanType): |
| scol = scol.cast("integer") |
| |
| return scol |
| |
| new_column_labels = [] # type: List[Label] |
| for label in self._internal.column_labels: |
# Keep only columns of numeric or boolean type.
| dtype = self._psser_for(label).spark.data_type |
| if isinstance(dtype, (NumericType, BooleanType)): |
| new_column_labels.append(label) |
| |
| new_columns = [ |
| F.avg(get_spark_column(self, label)).alias(name_like_string(label)) |
| for label in new_column_labels |
| ] |
| |
| mean_data = self._internal.spark_frame.select(*new_columns).first() |
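# First pass: collect the per-column means; the second pass below averages
# the absolute deviations from them.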
| |
| new_columns = [ |
| F.avg( |
| F.abs(get_spark_column(self, label) - mean_data[name_like_string(label)]) |
| ).alias(name_like_string(label)) |
| for label in new_column_labels |
| ] |
| |
| sdf = self._internal.spark_frame.select( |
| *[SF.lit(None).cast(StringType()).alias(SPARK_DEFAULT_INDEX_NAME)], *new_columns |
| ) |
| |
| # The data is expected to be small so it's fine to transpose/use default index. |
| with ps.option_context("compute.max_rows", 1): |
| internal = InternalFrame( |
| spark_frame=sdf, |
| index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)], |
| column_labels=new_column_labels, |
| column_label_names=self._internal.column_label_names, |
| ) |
| return first_series(DataFrame(internal).transpose()) |
| |
| else: |
| |
| @pandas_udf(returnType=DoubleType()) # type: ignore |
| def calculate_columns_axis(*cols: pd.Series) -> pd.Series: |
| return pd.concat(cols, axis=1).mad(axis=1) |
| |
| internal = self._internal.copy( |
| column_labels=[None], |
| data_spark_columns=[ |
| calculate_columns_axis(*self._internal.data_spark_columns).alias( |
| SPARK_DEFAULT_SERIES_NAME |
| ) |
| ], |
| data_fields=[None], |
| column_label_names=None, |
| ) |
| return first_series(DataFrame(internal)) |
| |
| def tail(self, n: int = 5) -> "DataFrame": |
| """ |
| Return the last `n` rows. |
| |
| This function returns last `n` rows from the object based on |
| position. It is useful for quickly verifying data, for example, |
| after sorting or appending rows. |
| |
| For negative values of `n`, this function returns all rows except |
| the first `n` rows, equivalent to ``df[n:]``. |
| |
| Parameters |
| ---------- |
| n : int, default 5 |
| Number of rows to select. |
| |
| Returns |
| ------- |
| type of caller |
| The last `n` rows of the caller object. |
| |
| See Also |
| -------- |
| DataFrame.head : The first `n` rows of the caller object. |
| |
| Examples |
| -------- |
| >>> df = ps.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', |
| ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) |
| >>> df |
| animal |
| 0 alligator |
| 1 bee |
| 2 falcon |
| 3 lion |
| 4 monkey |
| 5 parrot |
| 6 shark |
| 7 whale |
| 8 zebra |
| |
| Viewing the last 5 lines |
| |
| >>> df.tail() # doctest: +SKIP |
| animal |
| 4 monkey |
| 5 parrot |
| 6 shark |
| 7 whale |
| 8 zebra |
| |
| Viewing the last `n` lines (three in this case) |
| |
| >>> df.tail(3) # doctest: +SKIP |
| animal |
| 6 shark |
| 7 whale |
| 8 zebra |
| |
| For negative values of `n` |
| |
| >>> df.tail(-3) # doctest: +SKIP |
| animal |
| 3 lion |
| 4 monkey |
| 5 parrot |
| 6 shark |
| 7 whale |
| 8 zebra |
| """ |
| if not isinstance(n, int): |
| raise TypeError("bad operand type for unary -: '{}'".format(type(n).__name__)) |
| if n < 0: |
| n = len(self) + n |
| if n <= 0: |
| return ps.DataFrame(self._internal.with_filter(SF.lit(False))) |
        # Use `resolved_copy` here to handle cases like `(psdf + 1).tail()`.
| sdf = self._internal.resolved_copy.spark_frame |
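        # `tail` on the Spark DataFrame collects the last rows to the driver,
        # so re-create a Spark DataFrame from them with the same schema.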
| rows = sdf.tail(n) |
| new_sdf = default_session().createDataFrame(rows, sdf.schema) |
| |
| return DataFrame(self._internal.with_new_sdf(new_sdf)) |
| |
| def align( |
| self, |
| other: DataFrameOrSeries, |
| join: str = "outer", |
| axis: Optional[Axis] = None, |
| copy: bool = True, |
| ) -> Tuple["DataFrame", DataFrameOrSeries]: |
| """ |
| Align two objects on their axes with the specified join method. |
| |
| Join method is specified for each axis Index. |
| |
| Parameters |
| ---------- |
| other : DataFrame or Series |
        join : {'outer', 'inner', 'left', 'right'}, default 'outer'
| axis : allowed axis of the other object, default None |
| Align on index (0), columns (1), or both (None). |
| copy : bool, default True |
| Always returns new objects. If copy=False and no reindexing is |
| required then original objects are returned. |
| |
| Returns |
| ------- |
| (left, right) : (DataFrame, type of other) |
| Aligned objects. |
| |
| Examples |
| -------- |
| >>> ps.set_option("compute.ops_on_diff_frames", True) |
| >>> df1 = ps.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=[10, 20, 30]) |
| >>> df2 = ps.DataFrame({"a": [4, 5, 6], "c": ["d", "e", "f"]}, index=[10, 11, 12]) |
| |
| Align both axis: |
| |
| >>> aligned_l, aligned_r = df1.align(df2) |
| >>> aligned_l.sort_index() |
| a b c |
| 10 1.0 a NaN |
| 11 NaN None NaN |
| 12 NaN None NaN |
| 20 2.0 b NaN |
| 30 3.0 c NaN |
| >>> aligned_r.sort_index() |
| a b c |
| 10 4.0 NaN d |
| 11 5.0 NaN e |
| 12 6.0 NaN f |
| 20 NaN NaN None |
| 30 NaN NaN None |
| |
| Align only axis=0 (index): |
| |
| >>> aligned_l, aligned_r = df1.align(df2, axis=0) |
| >>> aligned_l.sort_index() |
| a b |
| 10 1.0 a |
| 11 NaN None |
| 12 NaN None |
| 20 2.0 b |
| 30 3.0 c |
| >>> aligned_r.sort_index() |
| a c |
| 10 4.0 d |
| 11 5.0 e |
| 12 6.0 f |
| 20 NaN None |
| 30 NaN None |
| |
| Align only axis=1 (column): |
| |
| >>> aligned_l, aligned_r = df1.align(df2, axis=1) |
| >>> aligned_l.sort_index() |
| a b c |
| 10 1 a NaN |
| 20 2 b NaN |
| 30 3 c NaN |
| >>> aligned_r.sort_index() |
| a b c |
| 10 4 NaN d |
| 11 5 NaN e |
| 12 6 NaN f |
| |
| Align with the join type "inner": |
| |
| >>> aligned_l, aligned_r = df1.align(df2, join="inner") |
| >>> aligned_l.sort_index() |
| a |
| 10 1 |
| >>> aligned_r.sort_index() |
| a |
| 10 4 |
| |
| Align with a Series: |
| |
| >>> s = ps.Series([7, 8, 9], index=[10, 11, 12]) |
| >>> aligned_l, aligned_r = df1.align(s, axis=0) |
| >>> aligned_l.sort_index() |
| a b |
| 10 1.0 a |
| 11 NaN None |
| 12 NaN None |
| 20 2.0 b |
| 30 3.0 c |
| >>> aligned_r.sort_index() |
| 10 7.0 |
| 11 8.0 |
| 12 9.0 |
| 20 NaN |
| 30 NaN |
| dtype: float64 |
| |
| >>> ps.reset_option("compute.ops_on_diff_frames") |
| """ |
| from pyspark.pandas.series import Series, first_series |
| |
| if not isinstance(other, (DataFrame, Series)): |
| raise TypeError("unsupported type: {}".format(type(other).__name__)) |
| |
| how = validate_how(join) |
| axis = validate_axis(axis, None) |
| |
| right_is_series = isinstance(other, Series) |
| if right_is_series: |
| if axis is None: |
| raise ValueError("Must specify axis=0 or 1") |
| elif axis != 0: |
| raise NotImplementedError( |
| "align currently only works for axis=0 when right is Series" |
| ) |
| |
| left = self |
| right = other |
| |
| if (axis is None or axis == 0) and not same_anchor(left, right): |
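            # Align along the index by joining the two frames into a single
            # Spark DataFrame when they do not share the same anchor.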
| combined = combine_frames(left, right, how=how) |
| left = combined["this"] |
| right = combined["that"] |
| |
| if right_is_series: |
| right = first_series(right).rename(other.name) |
| |
| if ( |
| axis is None or axis == 1 |
| ) and left._internal.column_labels != right._internal.column_labels: |
| |
| if left._internal.column_labels_level != right._internal.column_labels_level: |
| raise ValueError("cannot join with no overlapping index names") |
| |
| left = left.copy() |
| right = right.copy() |
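            # Determine the resulting column labels from the join type:
            # "full" (i.e. "outer") keeps the union, "inner" the intersection,
            # and "left"/"right" keep one side's labels as-is; missing labels
            # are added below as null double columns.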
| |
| if how == "full": |
| column_labels = sorted( |
| list(set(left._internal.column_labels) | set(right._internal.column_labels)) |
| ) |
| elif how == "inner": |
| column_labels = sorted( |
| list(set(left._internal.column_labels) & set(right._internal.column_labels)) |
| ) |
| elif how == "left": |
| column_labels = left._internal.column_labels |
| else: |
| column_labels = right._internal.column_labels |
| |
| for label in column_labels: |
| if label not in left._internal.column_labels: |
| left[label] = SF.lit(None).cast(DoubleType()) |
| left = left[column_labels] |
| for label in column_labels: |
| if label not in right._internal.column_labels: |
| right[label] = SF.lit(None).cast(DoubleType()) |
| right = right[column_labels] |
| |
| return (left.copy(), right.copy()) if copy else (left, right) |
| |
| @staticmethod |
| def from_dict( |
| data: Dict[Name, Sequence[Any]], |
| orient: str = "columns", |
        dtype: Optional[Union[str, Dtype]] = None,
| columns: Optional[List[Name]] = None, |
| ) -> "DataFrame": |
| """ |
| Construct DataFrame from dict of array-like or dicts. |
| |
| Creates DataFrame object from dictionary by columns or by index |
| allowing dtype specification. |
| |
| Parameters |
| ---------- |
| data : dict |
| Of the form {field : array-like} or {field : dict}. |
| orient : {'columns', 'index'}, default 'columns' |
| The "orientation" of the data. If the keys of the passed dict |
| should be the columns of the resulting DataFrame, pass 'columns' |
| (default). Otherwise if the keys should be rows, pass 'index'. |
| dtype : dtype, default None |
| Data type to force, otherwise infer. |
| columns : list, default None |
| Column labels to use when ``orient='index'``. Raises a ValueError |
| if used with ``orient='columns'``. |
| |
| Returns |
| ------- |
| DataFrame |
| |
| See Also |
| -------- |
| DataFrame.from_records : DataFrame from structured ndarray, sequence |
| of tuples or dicts, or DataFrame. |
| DataFrame : DataFrame object creation using constructor. |
| |
| Examples |
| -------- |
| By default the keys of the dict become the DataFrame columns: |
| |
| >>> data = {'col_1': [3, 2, 1, 0], 'col_2': [10, 20, 30, 40]} |
| >>> ps.DataFrame.from_dict(data) |
| col_1 col_2 |
| 0 3 10 |
| 1 2 20 |
| 2 1 30 |
| 3 0 40 |
| |
| Specify ``orient='index'`` to create the DataFrame using dictionary |
| keys as rows: |
| |
| >>> data = {'row_1': [3, 2, 1, 0], 'row_2': [10, 20, 30, 40]} |
| >>> ps.DataFrame.from_dict(data, orient='index').sort_index() |
| 0 1 2 3 |
| row_1 3 2 1 0 |
| row_2 10 20 30 40 |
| |
| When using the 'index' orientation, the column names can be |
| specified manually: |
| |
| >>> ps.DataFrame.from_dict(data, orient='index', |
| ... columns=['A', 'B', 'C', 'D']).sort_index() |
| A B C D |
| row_1 3 2 1 0 |
| row_2 10 20 30 40 |
| """ |
| return DataFrame(pd.DataFrame.from_dict(data, orient=orient, dtype=dtype, columns=columns)) |
| |
| # Override the `groupby` to specify the actual return type annotation. |
| def groupby( |
| self, |
| by: Union[Name, "Series", List[Union[Name, "Series"]]], |
| axis: Axis = 0, |
| as_index: bool = True, |
| dropna: bool = True, |
| ) -> "DataFrameGroupBy": |
| return cast( |
| "DataFrameGroupBy", super().groupby(by=by, axis=axis, as_index=as_index, dropna=dropna) |
| ) |
| |
| groupby.__doc__ = Frame.groupby.__doc__ |
| |
| def _build_groupby( |
| self, by: List[Union["Series", Label]], as_index: bool, dropna: bool |
| ) -> "DataFrameGroupBy": |
| from pyspark.pandas.groupby import DataFrameGroupBy |
| |
| return DataFrameGroupBy._build(self, by, as_index=as_index, dropna=dropna) |
| |
| def _to_internal_pandas(self) -> pd.DataFrame: |
| """ |
| Return a pandas DataFrame directly from _internal to avoid overhead of copy. |
| |
| This method is for internal use only. |
| """ |
| return self._internal.to_pandas_frame |
| |
| def _get_or_create_repr_pandas_cache(self, n: int) -> pd.DataFrame: |
| if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache: |
| object.__setattr__( |
| self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()} |
| ) |
| return self._repr_pandas_cache[n] |
| |
| def __repr__(self) -> str: |
| max_display_count = get_option("display.max_rows") |
| if max_display_count is None: |
| return self._to_internal_pandas().to_string() |
| |
| pdf = self._get_or_create_repr_pandas_cache(max_display_count) |
| pdf_length = len(pdf) |
| pdf = pdf.iloc[:max_display_count] |
| if pdf_length > max_display_count: |
| repr_string = pdf.to_string(show_dimensions=True) |
| match = REPR_PATTERN.search(repr_string) |
| if match is not None: |
| nrows = match.group("rows") |
| ncols = match.group("columns") |
| footer = "\n\n[Showing only the first {nrows} rows x {ncols} columns]".format( |
| nrows=nrows, ncols=ncols |
| ) |
| return REPR_PATTERN.sub(footer, repr_string) |
| return pdf.to_string() |
| |
| def _repr_html_(self) -> str: |
| max_display_count = get_option("display.max_rows") |
| # pandas 0.25.1 has a regression about HTML representation so 'bold_rows' |
| # has to be set as False explicitly. See https://github.com/pandas-dev/pandas/issues/28204 |
| bold_rows = not (LooseVersion("0.25.1") == LooseVersion(pd.__version__)) |
| if max_display_count is None: |
| return self._to_internal_pandas().to_html(notebook=True, bold_rows=bold_rows) |
| |
| pdf = self._get_or_create_repr_pandas_cache(max_display_count) |
| pdf_length = len(pdf) |
| pdf = pdf.iloc[:max_display_count] |
| if pdf_length > max_display_count: |
| repr_html = pdf.to_html(show_dimensions=True, notebook=True, bold_rows=bold_rows) |
| match = REPR_HTML_PATTERN.search(repr_html) |
| if match is not None: |
| nrows = match.group("rows") |
| ncols = match.group("columns") |
                    by = chr(215)  # the multiplication sign '×'
| footer = ( |
| "\n<p>Showing only the first {rows} rows " |
| "{by} {cols} columns</p>\n</div>".format(rows=nrows, by=by, cols=ncols) |
| ) |
| return REPR_HTML_PATTERN.sub(footer, repr_html) |
| return pdf.to_html(notebook=True, bold_rows=bold_rows) |
| |
| def __getitem__(self, key: Any) -> Any: |
| from pyspark.pandas.series import Series |
| |
| if key is None: |
| raise KeyError("none key") |
| elif isinstance(key, Series): |
| return self.loc[key.astype(bool)] |
| elif isinstance(key, slice): |
            if any(type(n) == int for n in [key.start, key.stop]):
                # pandas seems to always treat integer slice bounds as
                # positional, so use iloc when either bound is an int.
| return self.iloc[key] |
| return self.loc[key] |
| elif is_name_like_value(key): |
| return self.loc[:, key] |
| elif is_list_like(key): |
| return self.loc[:, list(key)] |
| raise NotImplementedError(key) |
| |
| def __setitem__(self, key: Any, value: Any) -> None: |
| from pyspark.pandas.series import Series |
| |
| if isinstance(value, (DataFrame, Series)) and not same_anchor(value, self): |
| # Different Series or DataFrames |
| level = self._internal.column_labels_level |
| key = DataFrame._index_normalized_label(level, key) |
| value = DataFrame._index_normalized_frame(level, value) |
| |
| def assign_columns( |
| psdf: DataFrame, this_column_labels: List[Label], that_column_labels: List[Label] |
| ) -> Iterator[Tuple["Series", Label]]: |
| assert len(key) == len(that_column_labels) |
                # `zip_longest` is used intentionally so that every label in
                # `that_column_labels` is consumed even when
                # `this_column_labels` is shorter.
| for k, this_label, that_label in zip_longest( |
| key, this_column_labels, that_column_labels |
| ): |
| yield (psdf._psser_for(that_label), tuple(["that", *k])) |
| if this_label is not None and this_label[1:] != k: |
| yield (psdf._psser_for(this_label), this_label) |
| |
| psdf = align_diff_frames(assign_columns, self, value, fillna=False, how="left") |
| elif isinstance(value, list): |
| if len(self) != len(value): |
| raise ValueError("Length of values does not match length of index") |
| |
| # TODO: avoid using default index? |
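            # Attach the list values by position: reset the index, assign the
            # values against a distributed-sequence default index, then restore
            # the original index.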
| with option_context( |
| "compute.default_index_type", |
| "distributed-sequence", |
| "compute.ops_on_diff_frames", |
| True, |
| ): |
| psdf = self.reset_index() |
| psdf[key] = ps.DataFrame(value) |
| psdf = psdf.set_index(psdf.columns[: self._internal.index_level]) |
| psdf.index.names = self.index.names |
| |
| elif isinstance(key, list): |
| assert isinstance(value, DataFrame) |
| # Same DataFrames. |
| field_names = value.columns |
| psdf = self._assign({k: value[c] for k, c in zip(key, field_names)}) |
| else: |
| # Same Series. |
| psdf = self._assign({key: value}) |
| |
| self._update_internal_frame(psdf._internal) |
| |
| @staticmethod |
| def _index_normalized_label(level: int, labels: Union[Name, Sequence[Name]]) -> List[Label]: |
| """ |
| Returns a label that is normalized against the current column index level. |
        For example, the key "abc" can be normalized to ("abc", "", "") if the
        current DataFrame has a multi-index for its columns.
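
        Examples
        --------
        A minimal sketch of the padding behavior (skipped in doctests):

        >>> DataFrame._index_normalized_label(2, "abc")  # doctest: +SKIP
        [('abc', '')]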
| """ |
| if is_name_like_tuple(labels): |
| labels = [labels] |
| elif is_name_like_value(labels): |
| labels = [(labels,)] |
| else: |
| labels = [k if is_name_like_tuple(k) else (k,) for k in labels] |
| |
| if any(len(label) > level for label in labels): |
| raise KeyError( |
| "Key length ({}) exceeds index depth ({})".format( |
| max(len(label) for label in labels), level |
| ) |
| ) |
| return [tuple(list(label) + ([""] * (level - len(label)))) for label in labels] |
| |
| @staticmethod |
| def _index_normalized_frame(level: int, psser_or_psdf: DataFrameOrSeries) -> "DataFrame": |
| """ |
| Returns a frame that is normalized against the current column index level. |
        For example, the name in `pd.Series([...], name="abc")` can be
        normalized to ("abc", "", "") if the current DataFrame has a
        multi-index for its columns.
| """ |
| from pyspark.pandas.series import Series |
| |
| if isinstance(psser_or_psdf, Series): |
| psdf = psser_or_psdf.to_frame() |
| else: |
| assert isinstance(psser_or_psdf, DataFrame), type(psser_or_psdf) |
| psdf = psser_or_psdf.copy() |
| |
| psdf.columns = pd.MultiIndex.from_tuples( |
| [ |
| tuple([name_like_string(label)] + ([""] * (level - 1))) |
| for label in psdf._internal.column_labels |
| ], |
| ) |
| |
| return psdf |
| |
| def __getattr__(self, key: str) -> Any: |
| if key.startswith("__"): |
| raise AttributeError(key) |
| if hasattr(_MissingPandasLikeDataFrame, key): |
| property_or_func = getattr(_MissingPandasLikeDataFrame, key) |
| if isinstance(property_or_func, property): |
| return property_or_func.fget(self) # type: ignore |
| else: |
| return partial(property_or_func, self) |
| |
| try: |
| return self.loc[:, key] |
| except KeyError: |
| raise AttributeError( |
| "'%s' object has no attribute '%s'" % (self.__class__.__name__, key) |
| ) |
| |
| def __setattr__(self, key: str, value: Any) -> None: |
| try: |
| object.__getattribute__(self, key) |
| return object.__setattr__(self, key, value) |
| except AttributeError: |
| pass |
| |
| if (key,) in self._internal.column_labels: |
| self[key] = value |
| else: |
| msg = "pandas-on-Spark doesn't allow columns to be created via a new attribute name" |
| if is_testing(): |
| raise AssertionError(msg) |
| else: |
| warnings.warn(msg, UserWarning) |
| |
| def __len__(self) -> int: |
| return self._internal.resolved_copy.spark_frame.count() |
| |
| def __dir__(self) -> Iterable[str]: |
| fields = [ |
| f for f in self._internal.resolved_copy.spark_frame.schema.fieldNames() if " " not in f |
| ] |
| return list(super().__dir__()) + fields |
| |
| def __iter__(self) -> Iterator[Name]: |
| return iter(self.columns) |
| |
| # NDArray Compat |
| def __array_ufunc__( |
| self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any |
| ) -> "DataFrame": |
| # TODO: is it possible to deduplicate it with '_map_series_op'? |
| if all(isinstance(inp, DataFrame) for inp in inputs) and any( |
| not same_anchor(inp, inputs[0]) for inp in inputs |
| ): |
| # binary only |
| assert len(inputs) == 2 |
| this = inputs[0] |
| that = inputs[1] |
| if this._internal.column_labels_level != that._internal.column_labels_level: |
| raise ValueError("cannot join with no overlapping index names") |
| |
| # Different DataFrames |
| def apply_op( |
| psdf: DataFrame, this_column_labels: List[Label], that_column_labels: List[Label] |
| ) -> Iterator[Tuple["Series", Label]]: |
| for this_label, that_label in zip(this_column_labels, that_column_labels): |
| yield ( |
| ufunc( |
| psdf._psser_for(this_label), psdf._psser_for(that_label), **kwargs |
| ).rename(this_label), |
| this_label, |
| ) |
| |
| return align_diff_frames(apply_op, this, that, fillna=True, how="full") |
| else: |
| # DataFrame and Series |
| applied = [] |
| this = inputs[0] |
| assert all(inp is this for inp in inputs if isinstance(inp, DataFrame)) |
| |
| for label in this._internal.column_labels: |
| arguments = [] |
| for inp in inputs: |
| arguments.append(inp[label] if isinstance(inp, DataFrame) else inp) |
| # both binary and unary. |
| applied.append(ufunc(*arguments, **kwargs).rename(label)) |
| |
| internal = this._internal.with_new_columns(applied) |
| return DataFrame(internal) |
| |
| if sys.version_info >= (3, 7): |
| |
| def __class_getitem__(cls, params: Any) -> object: |
| # This is a workaround to support variadic generic in DataFrame in Python 3.7. |
| # See https://github.com/python/typing/issues/193 |
        # We always wrap the given type hints in a tuple to mimic the variadic generic.
| return _create_tuple_for_frame_type(params) |
| |
| elif (3, 5) <= sys.version_info < (3, 7): |
| # This is a workaround to support variadic generic in DataFrame in Python 3.5+ |
| # The implementation is in its metaclass so this flag is needed to distinguish |
| # pandas-on-Spark DataFrame. |
| is_dataframe = None |
| |
| |
| def _reduce_spark_multi(sdf: SparkDataFrame, aggs: List[Column]) -> Any: |
| """ |
    Performs a reduction on a Spark DataFrame, where the aggregation
    expressions are known SQL aggregate functions.
| """ |
| assert isinstance(sdf, SparkDataFrame) |
| sdf0 = sdf.agg(*aggs) |
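    # Limit to two rows so the single-row assertion below can detect an
    # unexpected multi-row result without collecting everything.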
    pdf = cast(pd.DataFrame, sdf0.limit(2).toPandas())
    assert len(pdf) == 1, (sdf, pdf)
    row = pdf.iloc[0]
    result = list(row)
    assert len(result) == len(aggs), (row, result)
    return result
| |
| |
| class CachedDataFrame(DataFrame): |
| """ |
| Cached pandas-on-Spark DataFrame, which corresponds to pandas DataFrame logically, but |
| internally it caches the corresponding Spark DataFrame. |
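
    A cached frame can also be used as a context manager, in which case the
    underlying Spark DataFrame is unpersisted on exit. A minimal sketch,
    skipped in doctests:

    >>> with ps.range(3).spark.cache() as cached:  # doctest: +SKIP
    ...     print(cached.count())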
| """ |
| |
| def __init__(self, internal: InternalFrame, storage_level: Optional[StorageLevel] = None): |
| if storage_level is None: |
| object.__setattr__(self, "_cached", internal.spark_frame.cache()) |
| elif isinstance(storage_level, StorageLevel): |
| object.__setattr__(self, "_cached", internal.spark_frame.persist(storage_level)) |
| else: |
| raise TypeError( |
| "Only a valid pyspark.StorageLevel type is acceptable for the `storage_level`" |
| ) |
| super().__init__(internal) |
| |
| def __enter__(self) -> "CachedDataFrame": |
| return self |
| |
| def __exit__( |
| self, |
| exception_type: Optional[Type[BaseException]], |
| exception_value: Optional[BaseException], |
| traceback: Optional[TracebackType], |
| ) -> Optional[bool]: |
| self.spark.unpersist() |
| return None |
| |
| # create accessor for Spark related methods. |
| spark = CachedAccessor("spark", CachedSparkFrameMethods) |
| |
| |
| def _test() -> None: |
| import os |
| import doctest |
| import shutil |
| import sys |
| import tempfile |
| import uuid |
| from pyspark.sql import SparkSession |
| import pyspark.pandas.frame |
| |
| os.chdir(os.environ["SPARK_HOME"]) |
| |
| globs = pyspark.pandas.frame.__dict__.copy() |
| globs["ps"] = pyspark.pandas |
| spark = ( |
| SparkSession.builder.master("local[4]").appName("pyspark.pandas.frame tests").getOrCreate() |
| ) |
| |
| db_name = "db%s" % str(uuid.uuid4()).replace("-", "") |
| spark.sql("CREATE DATABASE %s" % db_name) |
| globs["db"] = db_name |
| |
| path = tempfile.mkdtemp() |
| globs["path"] = path |
| |
| (failure_count, test_count) = doctest.testmod( |
| pyspark.pandas.frame, |
| globs=globs, |
| optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE, |
| ) |
| |
| shutil.rmtree(path, ignore_errors=True) |
| spark.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name) |
| spark.stop() |
| if failure_count: |
| sys.exit(-1) |
| |
| |
| if __name__ == "__main__": |
| _test() |