| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    Hashable,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    TypeVar,
    Union,
    overload,
)
| from typing_extensions import Literal |
| |
| from numpy import int32, int64, float32, float64, ndarray # type: ignore[import] |
| |
| from pyspark._typing import SupportsOrdering |
| from pyspark.sql.pandas._typing import ( |
| PandasScalarUDFType, |
| PandasScalarIterUDFType, |
| PandasGroupedMapUDFType, |
| PandasCogroupedMapUDFType, |
| PandasGroupedAggUDFType, |
| PandasMapIterUDFType, |
| ) |
| import pyspark.context |
| from pyspark.resultiterable import ResultIterable |
| from pyspark.serializers import Serializer |
| from pyspark.storagelevel import StorageLevel |
| from pyspark.resource.requests import ( # noqa: F401 |
| ExecutorResourceRequests, |
| TaskResourceRequests, |
| ) |
| from pyspark.resource.profile import ResourceProfile |
| from pyspark.statcounter import StatCounter |
| from pyspark.sql.dataframe import DataFrame |
| from pyspark.sql.types import StructType |
| from pyspark.sql._typing import RowLike |
| from py4j.java_gateway import JavaObject # type: ignore[import] |
| |
| T = TypeVar("T") |
| U = TypeVar("U") |
| K = TypeVar("K", bound=Hashable) |
| V = TypeVar("V") |
| V1 = TypeVar("V1") |
| V2 = TypeVar("V2") |
| V3 = TypeVar("V3") |
| O = TypeVar("O", bound=SupportsOrdering) |
| NumberOrArray = TypeVar( |
| "NumberOrArray", float, int, complex, int32, int64, float32, float64, ndarray |
| ) |
| |
| def portable_hash(x: Hashable) -> int: ... |
| |
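# The constants below mirror org.apache.spark.api.python.PythonEvalType on the
# JVM side; each code tells the Python worker how a UDF should be evaluated
# (the 200-series codes are the pandas-based variants).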
| class PythonEvalType: |
| NON_UDF: Literal[0] |
| SQL_BATCHED_UDF: Literal[100] |
| SQL_SCALAR_PANDAS_UDF: PandasScalarUDFType |
| SQL_GROUPED_MAP_PANDAS_UDF: PandasGroupedMapUDFType |
| SQL_GROUPED_AGG_PANDAS_UDF: PandasGroupedAggUDFType |
| SQL_WINDOW_AGG_PANDAS_UDF: Literal[203] |
| SQL_SCALAR_PANDAS_ITER_UDF: PandasScalarIterUDFType |
| SQL_MAP_PANDAS_ITER_UDF: PandasMapIterUDFType |
| SQL_COGROUPED_MAP_PANDAS_UDF: PandasCogroupedMapUDFType |
| |
| class BoundedFloat(float): |
| def __new__( |
| cls, mean: float, confidence: float, low: float, high: float |
| ) -> BoundedFloat: ... |
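# A BoundedFloat behaves as its mean value while also carrying the confidence
# level and [low, high] interval reported by the approximate operations
# (sumApprox, meanApprox).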
| |
| class Partitioner: |
| numPartitions: int |
| partitionFunc: Callable[[Any], int] |
| def __init__( |
| self, numPartitions: int, partitionFunc: Callable[[Any], int] |
| ) -> None: ... |
| def __eq__(self, other: Any) -> bool: ... |
| def __call__(self, k: Any) -> int: ... |
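# Illustrative sketch (not part of the stub contract): in the implementation a
# Partitioner maps a key to a partition index as partitionFunc(k) %
# numPartitions, e.g.
#
#   p = Partitioner(8, portable_hash)
#   p("some-key")  # -> int in range(8)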
| |
| class RDD(Generic[T]): |
| is_cached: bool |
| is_checkpointed: bool |
| ctx: pyspark.context.SparkContext |
| partitioner: Optional[Partitioner] |
| def __init__( |
| self, |
| jrdd: JavaObject, |
| ctx: pyspark.context.SparkContext, |
| jrdd_deserializer: Serializer = ..., |
| ) -> None: ... |
| def id(self) -> int: ... |
| def __getnewargs__(self) -> Any: ... |
| @property |
| def context(self) -> pyspark.context.SparkContext: ... |
| def cache(self) -> RDD[T]: ... |
| def persist(self, storageLevel: StorageLevel = ...) -> RDD[T]: ... |
| def unpersist(self, blocking: bool = ...) -> RDD[T]: ... |
| def checkpoint(self) -> None: ... |
| def isCheckpointed(self) -> bool: ... |
| def localCheckpoint(self) -> None: ... |
| def isLocallyCheckpointed(self) -> bool: ... |
| def getCheckpointFile(self) -> Optional[str]: ... |
| def map(self, f: Callable[[T], U], preservesPartitioning: bool = ...) -> RDD[U]: ... |
| def flatMap( |
| self, f: Callable[[T], Iterable[U]], preservesPartitioning: bool = ... |
| ) -> RDD[U]: ... |
| def mapPartitions( |
| self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ... |
| ) -> RDD[U]: ... |
| def mapPartitionsWithIndex( |
| self, |
| f: Callable[[int, Iterable[T]], Iterable[U]], |
| preservesPartitioning: bool = ..., |
| ) -> RDD[U]: ... |
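    # Usage sketch (illustrative): f receives the partition index together
    # with an iterator over that partition's elements, e.g.
    #
    #   rdd.mapPartitionsWithIndex(lambda i, it: ((i, x) for x in it))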
| def mapPartitionsWithSplit( |
| self, |
| f: Callable[[int, Iterable[T]], Iterable[U]], |
| preservesPartitioning: bool = ..., |
| ) -> RDD[U]: ... |
| def getNumPartitions(self) -> int: ... |
| def filter(self, f: Callable[[T], bool]) -> RDD[T]: ... |
| def distinct(self, numPartitions: Optional[int] = ...) -> RDD[T]: ... |
| def sample( |
| self, withReplacement: bool, fraction: float, seed: Optional[int] = ... |
| ) -> RDD[T]: ... |
| def randomSplit( |
| self, weights: List[Union[int, float]], seed: Optional[int] = ... |
| ) -> List[RDD[T]]: ... |
| def takeSample( |
| self, withReplacement: bool, num: int, seed: Optional[int] = ... |
| ) -> List[T]: ... |
| def union(self, other: RDD[U]) -> RDD[Union[T, U]]: ... |
| def intersection(self, other: RDD[T]) -> RDD[T]: ... |
| def __add__(self, other: RDD[T]) -> RDD[T]: ... |
| @overload |
| def repartitionAndSortWithinPartitions( |
| self: RDD[Tuple[O, V]], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[O], int] = ..., |
| ascending: bool = ..., |
| ) -> RDD[Tuple[O, V]]: ... |
| @overload |
| def repartitionAndSortWithinPartitions( |
| self: RDD[Tuple[K, V]], |
| numPartitions: Optional[int], |
| partitionFunc: Callable[[K], int], |
| ascending: bool, |
| keyfunc: Callable[[K], O], |
| ) -> RDD[Tuple[K, V]]: ... |
| @overload |
| def repartitionAndSortWithinPartitions( |
| self: RDD[Tuple[K, V]], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[K], int] = ..., |
| ascending: bool = ..., |
| *, |
| keyfunc: Callable[[K], O] |
| ) -> RDD[Tuple[K, V]]: ... |
| @overload |
| def sortByKey( |
| self: RDD[Tuple[O, V]], |
| ascending: bool = ..., |
| numPartitions: Optional[int] = ..., |
    ) -> RDD[Tuple[O, V]]: ...
| @overload |
| def sortByKey( |
| self: RDD[Tuple[K, V]], |
| ascending: bool, |
| numPartitions: int, |
| keyfunc: Callable[[K], O], |
| ) -> RDD[Tuple[K, V]]: ... |
| @overload |
| def sortByKey( |
| self: RDD[Tuple[K, V]], |
| ascending: bool = ..., |
| numPartitions: Optional[int] = ..., |
| *, |
| keyfunc: Callable[[K], O] |
| ) -> RDD[Tuple[K, V]]: ... |
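    # Overload resolution sketch (illustrative, hypothetical `pairs`): without
    # a keyfunc the key type itself must support ordering (hence the O bound);
    # with a keyfunc any key type is accepted:
    #
    #   pairs: RDD[Tuple[str, int]]
    #   pairs.sortByKey()             # str is orderable
    #   pairs.sortByKey(keyfunc=len)  # sort by len(key) instead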
| def sortBy( |
| self: RDD[T], |
| keyfunc: Callable[[T], O], |
| ascending: bool = ..., |
| numPartitions: Optional[int] = ..., |
| ) -> RDD[T]: ... |
| def glom(self) -> RDD[List[T]]: ... |
| def cartesian(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ... |
| def groupBy( |
| self, |
| f: Callable[[T], K], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[K], int] = ..., |
| ) -> RDD[Tuple[K, Iterable[T]]]: ... |
| def pipe( |
| self, command: str, env: Optional[Dict[str, str]] = ..., checkCode: bool = ... |
| ) -> RDD[str]: ... |
| def foreach(self, f: Callable[[T], None]) -> None: ... |
| def foreachPartition(self, f: Callable[[Iterable[T]], None]) -> None: ... |
| def collect(self) -> List[T]: ... |
| def collectWithJobGroup( |
| self, groupId: str, description: str, interruptOnCancel: bool = ... |
| ) -> List[T]: ... |
| def reduce(self, f: Callable[[T, T], T]) -> T: ... |
| def treeReduce(self, f: Callable[[T, T], T], depth: int = ...) -> T: ... |
| def fold(self, zeroValue: T, op: Callable[[T, T], T]) -> T: ... |
| def aggregate( |
| self, zeroValue: U, seqOp: Callable[[U, T], U], combOp: Callable[[U, U], U] |
| ) -> U: ... |
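    # Illustrative example of the zeroValue/seqOp/combOp typing (hypothetical
    # `nums: RDD[int]`): compute (sum, count) in one pass, binding U to
    # Tuple[int, int]:
    #
    #   total, count = nums.aggregate(
    #       (0, 0),
    #       lambda acc, x: (acc[0] + x, acc[1] + 1),  # seqOp: (U, T) -> U
    #       lambda a, b: (a[0] + b[0], a[1] + b[1]),  # combOp: (U, U) -> U
    #   )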
| def treeAggregate( |
| self, |
| zeroValue: U, |
| seqOp: Callable[[U, T], U], |
| combOp: Callable[[U, U], U], |
| depth: int = ..., |
| ) -> U: ... |
| @overload |
| def max(self: RDD[O]) -> O: ... |
| @overload |
| def max(self, key: Callable[[T], O]) -> T: ... |
| @overload |
| def min(self: RDD[O]) -> O: ... |
| @overload |
| def min(self, key: Callable[[T], O]) -> T: ... |
| def sum(self: RDD[NumberOrArray]) -> NumberOrArray: ... |
| def count(self) -> int: ... |
| def stats(self: RDD[NumberOrArray]) -> StatCounter: ... |
    def histogram(
        self, buckets: Union[int, List[T], Tuple[T, ...]]
    ) -> Tuple[List[T], List[int]]: ...
| def mean(self: RDD[NumberOrArray]) -> NumberOrArray: ... |
| def variance(self: RDD[NumberOrArray]) -> NumberOrArray: ... |
| def stdev(self: RDD[NumberOrArray]) -> NumberOrArray: ... |
| def sampleStdev(self: RDD[NumberOrArray]) -> NumberOrArray: ... |
| def sampleVariance(self: RDD[NumberOrArray]) -> NumberOrArray: ... |
| def countByValue(self: RDD[K]) -> Dict[K, int]: ... |
| @overload |
| def top(self: RDD[O], num: int) -> List[O]: ... |
| @overload |
| def top(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ... |
| @overload |
| def takeOrdered(self: RDD[O], num: int) -> List[O]: ... |
| @overload |
| def takeOrdered(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ... |
| def take(self, num: int) -> List[T]: ... |
| def first(self) -> T: ... |
| def isEmpty(self) -> bool: ... |
| def saveAsNewAPIHadoopDataset( |
| self: RDD[Tuple[K, V]], |
| conf: Dict[str, str], |
| keyConverter: Optional[str] = ..., |
| valueConverter: Optional[str] = ..., |
| ) -> None: ... |
| def saveAsNewAPIHadoopFile( |
| self: RDD[Tuple[K, V]], |
| path: str, |
| outputFormatClass: str, |
| keyClass: Optional[str] = ..., |
| valueClass: Optional[str] = ..., |
| keyConverter: Optional[str] = ..., |
| valueConverter: Optional[str] = ..., |
| conf: Optional[Dict[str, str]] = ..., |
| ) -> None: ... |
| def saveAsHadoopDataset( |
| self: RDD[Tuple[K, V]], |
| conf: Dict[str, str], |
| keyConverter: Optional[str] = ..., |
| valueConverter: Optional[str] = ..., |
| ) -> None: ... |
| def saveAsHadoopFile( |
| self: RDD[Tuple[K, V]], |
| path: str, |
| outputFormatClass: str, |
| keyClass: Optional[str] = ..., |
| valueClass: Optional[str] = ..., |
| keyConverter: Optional[str] = ..., |
| valueConverter: Optional[str] = ..., |
        conf: Optional[Dict[str, str]] = ...,
| compressionCodecClass: Optional[str] = ..., |
| ) -> None: ... |
| def saveAsSequenceFile( |
| self: RDD[Tuple[K, V]], path: str, compressionCodecClass: Optional[str] = ... |
| ) -> None: ... |
| def saveAsPickleFile(self, path: str, batchSize: int = ...) -> None: ... |
| def saveAsTextFile( |
| self, path: str, compressionCodecClass: Optional[str] = ... |
| ) -> None: ... |
| def collectAsMap(self: RDD[Tuple[K, V]]) -> Dict[K, V]: ... |
| def keys(self: RDD[Tuple[K, V]]) -> RDD[K]: ... |
| def values(self: RDD[Tuple[K, V]]) -> RDD[V]: ... |
| def reduceByKey( |
| self: RDD[Tuple[K, V]], |
| func: Callable[[V, V], V], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[K], int] = ..., |
| ) -> RDD[Tuple[K, V]]: ... |
| def reduceByKeyLocally( |
| self: RDD[Tuple[K, V]], func: Callable[[V, V], V] |
| ) -> Dict[K, V]: ... |
| def countByKey(self: RDD[Tuple[K, V]]) -> Dict[K, int]: ... |
| def join( |
| self: RDD[Tuple[K, V]], |
| other: RDD[Tuple[K, U]], |
| numPartitions: Optional[int] = ..., |
| ) -> RDD[Tuple[K, Tuple[V, U]]]: ... |
| def leftOuterJoin( |
| self: RDD[Tuple[K, V]], |
| other: RDD[Tuple[K, U]], |
| numPartitions: Optional[int] = ..., |
| ) -> RDD[Tuple[K, Tuple[V, Optional[U]]]]: ... |
| def rightOuterJoin( |
| self: RDD[Tuple[K, V]], |
| other: RDD[Tuple[K, U]], |
| numPartitions: Optional[int] = ..., |
| ) -> RDD[Tuple[K, Tuple[Optional[V], U]]]: ... |
| def fullOuterJoin( |
| self: RDD[Tuple[K, V]], |
| other: RDD[Tuple[K, U]], |
| numPartitions: Optional[int] = ..., |
| ) -> RDD[Tuple[K, Tuple[Optional[V], Optional[U]]]]: ... |
| def partitionBy( |
| self: RDD[Tuple[K, V]], |
| numPartitions: int, |
| partitionFunc: Callable[[K], int] = ..., |
| ) -> RDD[Tuple[K, V]]: ... |
| def combineByKey( |
| self: RDD[Tuple[K, V]], |
| createCombiner: Callable[[V], U], |
| mergeValue: Callable[[U, V], U], |
| mergeCombiners: Callable[[U, U], U], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[K], int] = ..., |
| ) -> RDD[Tuple[K, U]]: ... |
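    # Illustrative example (hypothetical `pairs: RDD[Tuple[str, float]]`):
    # per-key mean, binding the combiner type U to Tuple[float, int]:
    #
    #   means = pairs.combineByKey(
    #       lambda v: (v, 1),                         # createCombiner: V -> U
    #       lambda c, v: (c[0] + v, c[1] + 1),        # mergeValue: (U, V) -> U
    #       lambda a, b: (a[0] + b[0], a[1] + b[1]),  # mergeCombiners
    #   ).mapValues(lambda c: c[0] / c[1])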
| def aggregateByKey( |
| self: RDD[Tuple[K, V]], |
| zeroValue: U, |
| seqFunc: Callable[[U, V], U], |
| combFunc: Callable[[U, U], U], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[K], int] = ..., |
| ) -> RDD[Tuple[K, U]]: ... |
| def foldByKey( |
| self: RDD[Tuple[K, V]], |
| zeroValue: V, |
| func: Callable[[V, V], V], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[K], int] = ..., |
| ) -> RDD[Tuple[K, V]]: ... |
| def groupByKey( |
| self: RDD[Tuple[K, V]], |
| numPartitions: Optional[int] = ..., |
| partitionFunc: Callable[[K], int] = ..., |
| ) -> RDD[Tuple[K, Iterable[V]]]: ... |
| def flatMapValues( |
| self: RDD[Tuple[K, V]], f: Callable[[V], Iterable[U]] |
| ) -> RDD[Tuple[K, U]]: ... |
| def mapValues(self: RDD[Tuple[K, V]], f: Callable[[V], U]) -> RDD[Tuple[K, U]]: ... |
| @overload |
| def groupWith( |
| self: RDD[Tuple[K, V]], __o: RDD[Tuple[K, V1]] |
| ) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1]]]]: ... |
| @overload |
| def groupWith( |
| self: RDD[Tuple[K, V]], __o1: RDD[Tuple[K, V1]], __o2: RDD[Tuple[K, V2]] |
| ) -> RDD[ |
| Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1], ResultIterable[V2]]] |
| ]: ... |
| @overload |
| def groupWith( |
        self: RDD[Tuple[K, V]],
        __o1: RDD[Tuple[K, V1]],
        __o2: RDD[Tuple[K, V2]],
        __o3: RDD[Tuple[K, V3]],
| ) -> RDD[ |
| Tuple[ |
| K, |
| Tuple[ |
| ResultIterable[V], |
| ResultIterable[V1], |
| ResultIterable[V2], |
| ResultIterable[V3], |
| ], |
| ] |
| ]: ... |
| def cogroup( |
| self: RDD[Tuple[K, V]], |
| other: RDD[Tuple[K, U]], |
| numPartitions: Optional[int] = ..., |
| ) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[U]]]]: ... |
| def sampleByKey( |
| self: RDD[Tuple[K, V]], |
| withReplacement: bool, |
| fractions: Dict[K, Union[float, int]], |
| seed: Optional[int] = ..., |
| ) -> RDD[Tuple[K, V]]: ... |
| def subtractByKey( |
| self: RDD[Tuple[K, V]], |
| other: RDD[Tuple[K, U]], |
| numPartitions: Optional[int] = ..., |
| ) -> RDD[Tuple[K, V]]: ... |
| def subtract( |
| self: RDD[T], other: RDD[T], numPartitions: Optional[int] = ... |
| ) -> RDD[T]: ... |
| def keyBy(self: RDD[T], f: Callable[[T], K]) -> RDD[Tuple[K, T]]: ... |
| def repartition(self, numPartitions: int) -> RDD[T]: ... |
| def coalesce(self, numPartitions: int, shuffle: bool = ...) -> RDD[T]: ... |
| def zip(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ... |
| def zipWithIndex(self) -> RDD[Tuple[T, int]]: ... |
| def zipWithUniqueId(self) -> RDD[Tuple[T, int]]: ... |
| def name(self) -> str: ... |
| def setName(self, name: str) -> RDD[T]: ... |
| def toDebugString(self) -> bytes: ... |
| def getStorageLevel(self) -> StorageLevel: ... |
| def lookup(self: RDD[Tuple[K, V]], key: K) -> List[V]: ... |
| def countApprox(self, timeout: int, confidence: float = ...) -> int: ... |
| def sumApprox( |
| self: RDD[Union[float, int]], timeout: int, confidence: float = ... |
| ) -> BoundedFloat: ... |
| def meanApprox( |
| self: RDD[Union[float, int]], timeout: int, confidence: float = ... |
| ) -> BoundedFloat: ... |
| def countApproxDistinct(self, relativeSD: float = ...) -> int: ... |
| def toLocalIterator(self, prefetchPartitions: bool = ...) -> Iterator[T]: ... |
| def barrier(self: RDD[T]) -> RDDBarrier[T]: ... |
| def withResources(self: RDD[T], profile: ResourceProfile) -> RDD[T]: ... |
| def getResourceProfile(self) -> Optional[ResourceProfile]: ... |
| @overload |
| def toDF( |
| self: RDD[RowLike], |
| schema: Optional[List[str]] = ..., |
| sampleRatio: Optional[float] = ..., |
| ) -> DataFrame: ... |
| @overload |
| def toDF(self: RDD[RowLike], schema: Optional[StructType] = ...) -> DataFrame: ... |
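    # Note: toDF is not defined on RDD itself; it is attached at runtime once
    # a SparkSession has been created (see pyspark.sql.session). It is
    # declared here so type checkers can resolve it on RDDs of row-like
    # records.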
| |
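# An RDDBarrier is obtained via RDD.barrier(); mapping over it marks the
# resulting stage for barrier execution mode, in which all tasks of the stage
# are launched together.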
| class RDDBarrier(Generic[T]): |
| rdd: RDD[T] |
| def __init__(self, rdd: RDD[T]) -> None: ... |
| def mapPartitions( |
| self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ... |
| ) -> RDD[U]: ... |
| def mapPartitionsWithIndex( |
| self, |
| f: Callable[[int, Iterable[T]], Iterable[U]], |
| preservesPartitioning: bool = ..., |
| ) -> RDD[U]: ... |
| |
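# PipelinedRDD is an internal subclass that fuses consecutive narrow
# transformations (map, filter, mapPartitions, ...) into a single function
# applied per partition; T is the element type of the source RDD and U the
# element type produced by the fused function.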
| class PipelinedRDD(RDD[U], Generic[T, U]): |
    func: Callable[[int, Iterable[T]], Iterable[U]]
| preservesPartitioning: bool |
| is_cached: bool |
| is_checkpointed: bool |
| ctx: pyspark.context.SparkContext |
| prev: RDD[T] |
| partitioner: Optional[Partitioner] |
| is_barrier: bool |
| def __init__( |
| self, |
| prev: RDD[T], |
        func: Callable[[int, Iterable[T]], Iterable[U]],
| preservesPartitioning: bool = ..., |
| isFromBarrier: bool = ..., |
| ) -> None: ... |
| def getNumPartitions(self) -> int: ... |
| def id(self) -> int: ... |