#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import (
    overload,
    Any,
    Callable,
    Dict,
    Generic,
    Hashable,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
    TypeVar,
)
from typing_extensions import Literal
from numpy import int32, int64, float32, float64, ndarray # type: ignore[import]
from pyspark._typing import SupportsOrdering
from pyspark.sql.pandas._typing import (
PandasScalarUDFType,
PandasScalarIterUDFType,
PandasGroupedMapUDFType,
PandasCogroupedMapUDFType,
PandasGroupedAggUDFType,
PandasMapIterUDFType,
)
import pyspark.context
from pyspark.resultiterable import ResultIterable
from pyspark.serializers import Serializer
from pyspark.storagelevel import StorageLevel
from pyspark.resource.requests import ( # noqa: F401
ExecutorResourceRequests,
TaskResourceRequests,
)
from pyspark.resource.profile import ResourceProfile
from pyspark.statcounter import StatCounter
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StructType
from pyspark.sql._typing import RowLike
from py4j.java_gateway import JavaObject # type: ignore[import]
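# Type variables shared by the signatures below; O is constrained to types
# supporting ordering comparisons (SupportsOrdering).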
T = TypeVar("T")
U = TypeVar("U")
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")
V1 = TypeVar("V1")
V2 = TypeVar("V2")
V3 = TypeVar("V3")
O = TypeVar("O", bound=SupportsOrdering)
NumberOrArray = TypeVar(
"NumberOrArray", float, int, complex, int32, int64, float32, float64, ndarray
)
def portable_hash(x: Hashable) -> int: ...
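# Evaluation types for Python UDFs; the values mirror
# org.apache.spark.api.python.PythonEvalType on the JVM side.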
class PythonEvalType:
NON_UDF: Literal[0]
SQL_BATCHED_UDF: Literal[100]
SQL_SCALAR_PANDAS_UDF: PandasScalarUDFType
SQL_GROUPED_MAP_PANDAS_UDF: PandasGroupedMapUDFType
SQL_GROUPED_AGG_PANDAS_UDF: PandasGroupedAggUDFType
SQL_WINDOW_AGG_PANDAS_UDF: Literal[203]
SQL_SCALAR_PANDAS_ITER_UDF: PandasScalarIterUDFType
SQL_MAP_PANDAS_ITER_UDF: PandasMapIterUDFType
SQL_COGROUPED_MAP_PANDAS_UDF: PandasCogroupedMapUDFType
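# Float subclass carrying a confidence interval; returned by approximate
# actions such as sumApprox and meanApprox.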
class BoundedFloat(float):
def __new__(
cls, mean: float, confidence: float, low: float, high: float
) -> BoundedFloat: ...
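# Pairs a partition count with a key-to-partition function; used as the
# partitioner attribute of key-value RDDs.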
class Partitioner:
numPartitions: int
partitionFunc: Callable[[Any], int]
def __init__(
self, numPartitions: int, partitionFunc: Callable[[Any], int]
) -> None: ...
def __eq__(self, other: Any) -> bool: ...
def __call__(self, k: Any) -> int: ...
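# Core RDD interface; T is the element type.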
class RDD(Generic[T]):
is_cached: bool
is_checkpointed: bool
ctx: pyspark.context.SparkContext
partitioner: Optional[Partitioner]
def __init__(
self,
jrdd: JavaObject,
ctx: pyspark.context.SparkContext,
jrdd_deserializer: Serializer = ...,
) -> None: ...
def id(self) -> int: ...
def __getnewargs__(self) -> Any: ...
@property
def context(self) -> pyspark.context.SparkContext: ...
def cache(self) -> RDD[T]: ...
def persist(self, storageLevel: StorageLevel = ...) -> RDD[T]: ...
def unpersist(self, blocking: bool = ...) -> RDD[T]: ...
def checkpoint(self) -> None: ...
def isCheckpointed(self) -> bool: ...
def localCheckpoint(self) -> None: ...
def isLocallyCheckpointed(self) -> bool: ...
def getCheckpointFile(self) -> Optional[str]: ...
def map(self, f: Callable[[T], U], preservesPartitioning: bool = ...) -> RDD[U]: ...
def flatMap(
self, f: Callable[[T], Iterable[U]], preservesPartitioning: bool = ...
) -> RDD[U]: ...
def mapPartitions(
self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ...
) -> RDD[U]: ...
def mapPartitionsWithIndex(
self,
f: Callable[[int, Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
) -> RDD[U]: ...
def mapPartitionsWithSplit(
self,
f: Callable[[int, Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
) -> RDD[U]: ...
def getNumPartitions(self) -> int: ...
def filter(self, f: Callable[[T], bool]) -> RDD[T]: ...
def distinct(self, numPartitions: Optional[int] = ...) -> RDD[T]: ...
def sample(
self, withReplacement: bool, fraction: float, seed: Optional[int] = ...
) -> RDD[T]: ...
def randomSplit(
self, weights: List[Union[int, float]], seed: Optional[int] = ...
) -> List[RDD[T]]: ...
def takeSample(
self, withReplacement: bool, num: int, seed: Optional[int] = ...
) -> List[T]: ...
def union(self, other: RDD[U]) -> RDD[Union[T, U]]: ...
def intersection(self, other: RDD[T]) -> RDD[T]: ...
def __add__(self, other: RDD[T]) -> RDD[T]: ...
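# The sort-related methods below are overloaded: no keyfunc is needed when the
# key type is already orderable (O); otherwise a keyfunc mapping K to an
# orderable value must be supplied.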
@overload
def repartitionAndSortWithinPartitions(
self: RDD[Tuple[O, V]],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[O], int] = ...,
ascending: bool = ...,
) -> RDD[Tuple[O, V]]: ...
@overload
def repartitionAndSortWithinPartitions(
self: RDD[Tuple[K, V]],
numPartitions: Optional[int],
partitionFunc: Callable[[K], int],
ascending: bool,
keyfunc: Callable[[K], O],
) -> RDD[Tuple[K, V]]: ...
@overload
def repartitionAndSortWithinPartitions(
self: RDD[Tuple[K, V]],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
ascending: bool = ...,
*,
keyfunc: Callable[[K], O]
) -> RDD[Tuple[K, V]]: ...
@overload
def sortByKey(
self: RDD[Tuple[O, V]],
ascending: bool = ...,
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[O, V]]: ...
@overload
def sortByKey(
self: RDD[Tuple[K, V]],
ascending: bool,
numPartitions: int,
keyfunc: Callable[[K], O],
) -> RDD[Tuple[K, V]]: ...
@overload
def sortByKey(
self: RDD[Tuple[K, V]],
ascending: bool = ...,
numPartitions: Optional[int] = ...,
*,
keyfunc: Callable[[K], O]
) -> RDD[Tuple[K, V]]: ...
def sortBy(
self: RDD[T],
keyfunc: Callable[[T], O],
ascending: bool = ...,
numPartitions: Optional[int] = ...,
) -> RDD[T]: ...
def glom(self) -> RDD[List[T]]: ...
def cartesian(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ...
def groupBy(
self,
f: Callable[[T], K],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, Iterable[T]]]: ...
def pipe(
self, command: str, env: Optional[Dict[str, str]] = ..., checkCode: bool = ...
) -> RDD[str]: ...
def foreach(self, f: Callable[[T], None]) -> None: ...
def foreachPartition(self, f: Callable[[Iterable[T]], None]) -> None: ...
def collect(self) -> List[T]: ...
def collectWithJobGroup(
self, groupId: str, description: str, interruptOnCancel: bool = ...
) -> List[T]: ...
def reduce(self, f: Callable[[T, T], T]) -> T: ...
def treeReduce(self, f: Callable[[T, T], T], depth: int = ...) -> T: ...
def fold(self, zeroValue: T, op: Callable[[T, T], T]) -> T: ...
def aggregate(
self, zeroValue: U, seqOp: Callable[[U, T], U], combOp: Callable[[U, U], U]
) -> U: ...
def treeAggregate(
self,
zeroValue: U,
seqOp: Callable[[U, T], U],
combOp: Callable[[U, U], U],
depth: int = ...,
) -> U: ...
@overload
def max(self: RDD[O]) -> O: ...
@overload
def max(self, key: Callable[[T], O]) -> T: ...
@overload
def min(self: RDD[O]) -> O: ...
@overload
def min(self, key: Callable[[T], O]) -> T: ...
def sum(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def count(self) -> int: ...
def stats(self: RDD[NumberOrArray]) -> StatCounter: ...
def histogram(
    self, buckets: Union[int, List[T], Tuple[T, ...]]
) -> Tuple[List[T], List[int]]: ...
def mean(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def variance(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def stdev(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def sampleStdev(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def sampleVariance(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def countByValue(self: RDD[K]) -> Dict[K, int]: ...
@overload
def top(self: RDD[O], num: int) -> List[O]: ...
@overload
def top(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ...
@overload
def takeOrdered(self: RDD[O], num: int) -> List[O]: ...
@overload
def takeOrdered(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ...
def take(self, num: int) -> List[T]: ...
def first(self) -> T: ...
def isEmpty(self) -> bool: ...
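# Writers to Hadoop-compatible storage; keyClass, valueClass and the
# converter arguments take fully qualified Java class names.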
def saveAsNewAPIHadoopDataset(
self: RDD[Tuple[K, V]],
conf: Dict[str, str],
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
) -> None: ...
def saveAsNewAPIHadoopFile(
self: RDD[Tuple[K, V]],
path: str,
outputFormatClass: str,
keyClass: Optional[str] = ...,
valueClass: Optional[str] = ...,
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
conf: Optional[Dict[str, str]] = ...,
) -> None: ...
def saveAsHadoopDataset(
self: RDD[Tuple[K, V]],
conf: Dict[str, str],
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
) -> None: ...
def saveAsHadoopFile(
self: RDD[Tuple[K, V]],
path: str,
outputFormatClass: str,
keyClass: Optional[str] = ...,
valueClass: Optional[str] = ...,
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
conf: Optional[Dict[str, str]] = ...,
compressionCodecClass: Optional[str] = ...,
) -> None: ...
def saveAsSequenceFile(
self: RDD[Tuple[K, V]], path: str, compressionCodecClass: Optional[str] = ...
) -> None: ...
def saveAsPickleFile(self, path: str, batchSize: int = ...) -> None: ...
def saveAsTextFile(
self, path: str, compressionCodecClass: Optional[str] = ...
) -> None: ...
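# Key-value (pair RDD) operations; the self: RDD[Tuple[K, V]] annotations
# restrict them to RDDs of 2-tuples.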
def collectAsMap(self: RDD[Tuple[K, V]]) -> Dict[K, V]: ...
def keys(self: RDD[Tuple[K, V]]) -> RDD[K]: ...
def values(self: RDD[Tuple[K, V]]) -> RDD[V]: ...
def reduceByKey(
self: RDD[Tuple[K, V]],
func: Callable[[V, V], V],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, V]]: ...
def reduceByKeyLocally(
self: RDD[Tuple[K, V]], func: Callable[[V, V], V]
) -> Dict[K, V]: ...
def countByKey(self: RDD[Tuple[K, V]]) -> Dict[K, int]: ...
def join(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[V, U]]]: ...
def leftOuterJoin(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[V, Optional[U]]]]: ...
def rightOuterJoin(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[Optional[V], U]]]: ...
def fullOuterJoin(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[Optional[V], Optional[U]]]]: ...
def partitionBy(
self: RDD[Tuple[K, V]],
numPartitions: int,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, V]]: ...
def combineByKey(
self: RDD[Tuple[K, V]],
createCombiner: Callable[[V], U],
mergeValue: Callable[[U, V], U],
mergeCombiners: Callable[[U, U], U],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, U]]: ...
def aggregateByKey(
self: RDD[Tuple[K, V]],
zeroValue: U,
seqFunc: Callable[[U, V], U],
combFunc: Callable[[U, U], U],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, U]]: ...
def foldByKey(
self: RDD[Tuple[K, V]],
zeroValue: V,
func: Callable[[V, V], V],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, V]]: ...
def groupByKey(
self: RDD[Tuple[K, V]],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, Iterable[V]]]: ...
def flatMapValues(
self: RDD[Tuple[K, V]], f: Callable[[V], Iterable[U]]
) -> RDD[Tuple[K, U]]: ...
def mapValues(self: RDD[Tuple[K, V]], f: Callable[[V], U]) -> RDD[Tuple[K, U]]: ...
@overload
def groupWith(
self: RDD[Tuple[K, V]], __o: RDD[Tuple[K, V1]]
) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1]]]]: ...
@overload
def groupWith(
self: RDD[Tuple[K, V]], __o1: RDD[Tuple[K, V1]], __o2: RDD[Tuple[K, V2]]
) -> RDD[
Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1], ResultIterable[V2]]]
]: ...
@overload
def groupWith(
self: RDD[Tuple[K, V]],
other1: RDD[Tuple[K, V1]],
other2: RDD[Tuple[K, V2]],
other3: RDD[Tuple[K, V3]],
) -> RDD[
Tuple[
K,
Tuple[
ResultIterable[V],
ResultIterable[V1],
ResultIterable[V2],
ResultIterable[V3],
],
]
]: ...
def cogroup(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[U]]]]: ...
def sampleByKey(
self: RDD[Tuple[K, V]],
withReplacement: bool,
fractions: Dict[K, Union[float, int]],
seed: Optional[int] = ...,
) -> RDD[Tuple[K, V]]: ...
def subtractByKey(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, V]]: ...
def subtract(
self: RDD[T], other: RDD[T], numPartitions: Optional[int] = ...
) -> RDD[T]: ...
def keyBy(self: RDD[T], f: Callable[[T], K]) -> RDD[Tuple[K, T]]: ...
def repartition(self, numPartitions: int) -> RDD[T]: ...
def coalesce(self, numPartitions: int, shuffle: bool = ...) -> RDD[T]: ...
def zip(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ...
def zipWithIndex(self) -> RDD[Tuple[T, int]]: ...
def zipWithUniqueId(self) -> RDD[Tuple[T, int]]: ...
def name(self) -> str: ...
def setName(self, name: str) -> RDD[T]: ...
def toDebugString(self) -> bytes: ...
def getStorageLevel(self) -> StorageLevel: ...
def lookup(self: RDD[Tuple[K, V]], key: K) -> List[V]: ...
def countApprox(self, timeout: int, confidence: float = ...) -> int: ...
def sumApprox(
self: RDD[Union[float, int]], timeout: int, confidence: float = ...
) -> BoundedFloat: ...
def meanApprox(
self: RDD[Union[float, int]], timeout: int, confidence: float = ...
) -> BoundedFloat: ...
def countApproxDistinct(self, relativeSD: float = ...) -> int: ...
def toLocalIterator(self, prefetchPartitions: bool = ...) -> Iterator[T]: ...
def barrier(self: RDD[T]) -> RDDBarrier[T]: ...
def withResources(self: RDD[T], profile: ResourceProfile) -> RDD[T]: ...
def getResourceProfile(self) -> Optional[ResourceProfile]: ...
@overload
def toDF(
self: RDD[RowLike],
schema: Optional[List[str]] = ...,
sampleRatio: Optional[float] = ...,
) -> DataFrame: ...
@overload
def toDF(self: RDD[RowLike], schema: Optional[StructType] = ...) -> DataFrame: ...
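# Wrapper returned by RDD.barrier(); exposes the partition-level
# transformations allowed inside a barrier stage.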
class RDDBarrier(Generic[T]):
rdd: RDD[T]
def __init__(self, rdd: RDD[T]) -> None: ...
def mapPartitions(
self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ...
) -> RDD[U]: ...
def mapPartitionsWithIndex(
self,
f: Callable[[int, Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
) -> RDD[U]: ...
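# Internal RDD produced by pipelining narrow transformations such as map and
# mapPartitions over their parent RDD.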
class PipelinedRDD(RDD[U], Generic[T, U]):
func: Callable[[Iterable[T]], Iterable[U]]
preservesPartitioning: bool
is_cached: bool
is_checkpointed: bool
ctx: pyspark.context.SparkContext
prev: RDD[T]
partitioner: Optional[Partitioner]
is_barrier: bool
def __init__(
self,
prev: RDD[T],
func: Callable[[Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
isFromBarrier: bool = ...,
) -> None: ...
def getNumPartitions(self) -> int: ...
def id(self) -> int: ...