#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import (
    overload,
    Any,
    Callable,
    Dict,
    Generic,
    Hashable,
    Iterable,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
    TypeVar,
)
from typing_extensions import Literal
from numpy import int32, int64, float32, float64, ndarray # type: ignore[import]
from pyspark._typing import SupportsOrdering
from pyspark.sql.pandas._typing import (
PandasScalarUDFType,
PandasScalarIterUDFType,
PandasGroupedMapUDFType,
PandasCogroupedMapUDFType,
PandasGroupedAggUDFType,
PandasMapIterUDFType,
)
import pyspark.context
from pyspark.resultiterable import ResultIterable
from pyspark.serializers import Serializer
from pyspark.storagelevel import StorageLevel
from pyspark.resource.requests import ( # noqa: F401
ExecutorResourceRequests,
TaskResourceRequests,
)
from pyspark.resource.profile import ResourceProfile
from pyspark.statcounter import StatCounter
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import StructType
from pyspark.sql._typing import RowLike
from py4j.java_gateway import JavaObject # type: ignore[import]
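# Type variables shared by the signatures below; O is constrained to types
# supporting ordering comparisons (SupportsOrdering).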
T = TypeVar("T")
U = TypeVar("U")
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")
V1 = TypeVar("V1")
V2 = TypeVar("V2")
V3 = TypeVar("V3")
O = TypeVar("O", bound=SupportsOrdering)
NumberOrArray = TypeVar(
"NumberOrArray", float, int, complex, int32, int64, float32, float64, ndarray
)
def portable_hash(x: Hashable) -> int: ...
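# Evaluation types for Python UDFs; the values mirror
# org.apache.spark.api.python.PythonEvalType on the JVM side.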
class PythonEvalType:
NON_UDF: Literal[0]
SQL_BATCHED_UDF: Literal[100]
SQL_SCALAR_PANDAS_UDF: PandasScalarUDFType
SQL_GROUPED_MAP_PANDAS_UDF: PandasGroupedMapUDFType
SQL_GROUPED_AGG_PANDAS_UDF: PandasGroupedAggUDFType
SQL_WINDOW_AGG_PANDAS_UDF: Literal[203]
SQL_SCALAR_PANDAS_ITER_UDF: PandasScalarIterUDFType
SQL_MAP_PANDAS_ITER_UDF: PandasMapIterUDFType
SQL_COGROUPED_MAP_PANDAS_UDF: PandasCogroupedMapUDFType
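# Float subclass carrying a confidence interval; returned by approximate
# actions such as sumApprox and meanApprox.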
class BoundedFloat(float):
def __new__(
cls, mean: float, confidence: float, low: float, high: float
) -> BoundedFloat: ...
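# Pairs a partition count with a key-to-partition function; used as the
# partitioner attribute of key-value RDDs.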
class Partitioner:
numPartitions: int
partitionFunc: Callable[[Any], int]
def __init__(
self, numPartitions: int, partitionFunc: Callable[[Any], int]
) -> None: ...
def __eq__(self, other: Any) -> bool: ...
def __call__(self, k: Any) -> int: ...
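# Core RDD interface; T is the element type.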
class RDD(Generic[T]):
is_cached: bool
is_checkpointed: bool
ctx: pyspark.context.SparkContext
partitioner: Optional[Partitioner]
def __init__(
self,
jrdd: JavaObject,
ctx: pyspark.context.SparkContext,
jrdd_deserializer: Serializer = ...,
) -> None: ...
def id(self) -> int: ...
def __getnewargs__(self) -> Any: ...
@property
def context(self) -> pyspark.context.SparkContext: ...
def cache(self) -> RDD[T]: ...
def persist(self, storageLevel: StorageLevel = ...) -> RDD[T]: ...
def unpersist(self, blocking: bool = ...) -> RDD[T]: ...
def checkpoint(self) -> None: ...
def isCheckpointed(self) -> bool: ...
def localCheckpoint(self) -> None: ...
def isLocallyCheckpointed(self) -> bool: ...
def getCheckpointFile(self) -> Optional[str]: ...
def map(self, f: Callable[[T], U], preservesPartitioning: bool = ...) -> RDD[U]: ...
def flatMap(
self, f: Callable[[T], Iterable[U]], preservesPartitioning: bool = ...
) -> RDD[U]: ...
def mapPartitions(
self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ...
) -> RDD[U]: ...
def mapPartitionsWithIndex(
self,
f: Callable[[int, Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
) -> RDD[U]: ...
def mapPartitionsWithSplit(
self,
f: Callable[[int, Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
) -> RDD[U]: ...
def getNumPartitions(self) -> int: ...
def filter(self, f: Callable[[T], bool]) -> RDD[T]: ...
def distinct(self, numPartitions: Optional[int] = ...) -> RDD[T]: ...
def sample(
self, withReplacement: bool, fraction: float, seed: Optional[int] = ...
) -> RDD[T]: ...
def randomSplit(
self, weights: List[Union[int, float]], seed: Optional[int] = ...
) -> List[RDD[T]]: ...
def takeSample(
self, withReplacement: bool, num: int, seed: Optional[int] = ...
) -> List[T]: ...
def union(self, other: RDD[U]) -> RDD[Union[T, U]]: ...
def intersection(self, other: RDD[T]) -> RDD[T]: ...
def __add__(self, other: RDD[T]) -> RDD[T]: ...
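# The sort-related methods below are overloaded: no keyfunc is needed when the
# key type is already orderable (O); otherwise a keyfunc mapping K to an
# orderable value must be supplied.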
@overload
def repartitionAndSortWithinPartitions(
self: RDD[Tuple[O, V]],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[O], int] = ...,
ascending: bool = ...,
) -> RDD[Tuple[O, V]]: ...
@overload
def repartitionAndSortWithinPartitions(
self: RDD[Tuple[K, V]],
numPartitions: Optional[int],
partitionFunc: Callable[[K], int],
ascending: bool,
keyfunc: Callable[[K], O],
) -> RDD[Tuple[K, V]]: ...
@overload
def repartitionAndSortWithinPartitions(
self: RDD[Tuple[K, V]],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
ascending: bool = ...,
*,
keyfunc: Callable[[K], O]
) -> RDD[Tuple[K, V]]: ...
@overload
def sortByKey(
self: RDD[Tuple[O, V]],
ascending: bool = ...,
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[O, V]]: ...
@overload
def sortByKey(
self: RDD[Tuple[K, V]],
ascending: bool,
numPartitions: int,
keyfunc: Callable[[K], O],
) -> RDD[Tuple[K, V]]: ...
@overload
def sortByKey(
self: RDD[Tuple[K, V]],
ascending: bool = ...,
numPartitions: Optional[int] = ...,
*,
keyfunc: Callable[[K], O]
) -> RDD[Tuple[K, V]]: ...
def sortBy(
self: RDD[T],
keyfunc: Callable[[T], O],
ascending: bool = ...,
numPartitions: Optional[int] = ...,
) -> RDD[T]: ...
def glom(self) -> RDD[List[T]]: ...
def cartesian(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ...
def groupBy(
self,
f: Callable[[T], K],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, Iterable[T]]]: ...
def pipe(
self, command: str, env: Optional[Dict[str, str]] = ..., checkCode: bool = ...
) -> RDD[str]: ...
def foreach(self, f: Callable[[T], None]) -> None: ...
def foreachPartition(self, f: Callable[[Iterable[T]], None]) -> None: ...
def collect(self) -> List[T]: ...
def collectWithJobGroup(
self, groupId: str, description: str, interruptOnCancel: bool = ...
) -> List[T]: ...
def reduce(self, f: Callable[[T, T], T]) -> T: ...
def treeReduce(self, f: Callable[[T, T], T], depth: int = ...) -> T: ...
def fold(self, zeroValue: T, op: Callable[[T, T], T]) -> T: ...
def aggregate(
self, zeroValue: U, seqOp: Callable[[U, T], U], combOp: Callable[[U, U], U]
) -> U: ...
def treeAggregate(
self,
zeroValue: U,
seqOp: Callable[[U, T], U],
combOp: Callable[[U, U], U],
depth: int = ...,
) -> U: ...
@overload
def max(self: RDD[O]) -> O: ...
@overload
def max(self, key: Callable[[T], O]) -> T: ...
@overload
def min(self: RDD[O]) -> O: ...
@overload
def min(self, key: Callable[[T], O]) -> T: ...
def sum(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def count(self) -> int: ...
def stats(self: RDD[NumberOrArray]) -> StatCounter: ...
def histogram(
    self, buckets: Union[int, List[T], Tuple[T, ...]]
) -> Tuple[List[T], List[int]]: ...
def mean(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def variance(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def stdev(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def sampleStdev(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def sampleVariance(self: RDD[NumberOrArray]) -> NumberOrArray: ...
def countByValue(self: RDD[K]) -> Dict[K, int]: ...
@overload
def top(self: RDD[O], num: int) -> List[O]: ...
@overload
def top(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ...
@overload
def takeOrdered(self: RDD[O], num: int) -> List[O]: ...
@overload
def takeOrdered(self: RDD[T], num: int, key: Callable[[T], O]) -> List[T]: ...
def take(self, num: int) -> List[T]: ...
def first(self) -> T: ...
def isEmpty(self) -> bool: ...
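# Writers to Hadoop-compatible storage; keyClass, valueClass and the
# converter arguments take fully qualified Java class names.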
def saveAsNewAPIHadoopDataset(
self: RDD[Tuple[K, V]],
conf: Dict[str, str],
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
) -> None: ...
def saveAsNewAPIHadoopFile(
self: RDD[Tuple[K, V]],
path: str,
outputFormatClass: str,
keyClass: Optional[str] = ...,
valueClass: Optional[str] = ...,
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
conf: Optional[Dict[str, str]] = ...,
) -> None: ...
def saveAsHadoopDataset(
self: RDD[Tuple[K, V]],
conf: Dict[str, str],
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
) -> None: ...
def saveAsHadoopFile(
self: RDD[Tuple[K, V]],
path: str,
outputFormatClass: str,
keyClass: Optional[str] = ...,
valueClass: Optional[str] = ...,
keyConverter: Optional[str] = ...,
valueConverter: Optional[str] = ...,
conf: Optional[Dict[str, str]] = ...,
compressionCodecClass: Optional[str] = ...,
) -> None: ...
def saveAsSequenceFile(
self: RDD[Tuple[K, V]], path: str, compressionCodecClass: Optional[str] = ...
) -> None: ...
def saveAsPickleFile(self, path: str, batchSize: int = ...) -> None: ...
def saveAsTextFile(
self, path: str, compressionCodecClass: Optional[str] = ...
) -> None: ...
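# Key-value (pair RDD) operations; the self: RDD[Tuple[K, V]] annotations
# restrict them to RDDs of 2-tuples.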
def collectAsMap(self: RDD[Tuple[K, V]]) -> Dict[K, V]: ...
def keys(self: RDD[Tuple[K, V]]) -> RDD[K]: ...
def values(self: RDD[Tuple[K, V]]) -> RDD[V]: ...
def reduceByKey(
self: RDD[Tuple[K, V]],
func: Callable[[V, V], V],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, V]]: ...
def reduceByKeyLocally(
self: RDD[Tuple[K, V]], func: Callable[[V, V], V]
) -> Dict[K, V]: ...
def countByKey(self: RDD[Tuple[K, V]]) -> Dict[K, int]: ...
def join(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[V, U]]]: ...
def leftOuterJoin(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[V, Optional[U]]]]: ...
def rightOuterJoin(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[Optional[V], U]]]: ...
def fullOuterJoin(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[Optional[V], Optional[U]]]]: ...
def partitionBy(
self: RDD[Tuple[K, V]],
numPartitions: int,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, V]]: ...
def combineByKey(
self: RDD[Tuple[K, V]],
createCombiner: Callable[[V], U],
mergeValue: Callable[[U, V], U],
mergeCombiners: Callable[[U, U], U],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, U]]: ...
def aggregateByKey(
self: RDD[Tuple[K, V]],
zeroValue: U,
seqFunc: Callable[[U, V], U],
combFunc: Callable[[U, U], U],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, U]]: ...
def foldByKey(
self: RDD[Tuple[K, V]],
zeroValue: V,
func: Callable[[V, V], V],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, V]]: ...
def groupByKey(
self: RDD[Tuple[K, V]],
numPartitions: Optional[int] = ...,
partitionFunc: Callable[[K], int] = ...,
) -> RDD[Tuple[K, Iterable[V]]]: ...
def flatMapValues(
self: RDD[Tuple[K, V]], f: Callable[[V], Iterable[U]]
) -> RDD[Tuple[K, U]]: ...
def mapValues(self: RDD[Tuple[K, V]], f: Callable[[V], U]) -> RDD[Tuple[K, U]]: ...
@overload
def groupWith(
self: RDD[Tuple[K, V]], __o: RDD[Tuple[K, V1]]
) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1]]]]: ...
@overload
def groupWith(
self: RDD[Tuple[K, V]], __o1: RDD[Tuple[K, V1]], __o2: RDD[Tuple[K, V2]]
) -> RDD[
Tuple[K, Tuple[ResultIterable[V], ResultIterable[V1], ResultIterable[V2]]]
]: ...
@overload
def groupWith(
self: RDD[Tuple[K, V]],
other1: RDD[Tuple[K, V1]],
other2: RDD[Tuple[K, V2]],
other3: RDD[Tuple[K, V3]],
) -> RDD[
Tuple[
K,
Tuple[
ResultIterable[V],
ResultIterable[V1],
ResultIterable[V2],
ResultIterable[V3],
],
]
]: ...
def cogroup(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, Tuple[ResultIterable[V], ResultIterable[U]]]]: ...
def sampleByKey(
self: RDD[Tuple[K, V]],
withReplacement: bool,
fractions: Dict[K, Union[float, int]],
seed: Optional[int] = ...,
) -> RDD[Tuple[K, V]]: ...
def subtractByKey(
self: RDD[Tuple[K, V]],
other: RDD[Tuple[K, U]],
numPartitions: Optional[int] = ...,
) -> RDD[Tuple[K, V]]: ...
def subtract(
self: RDD[T], other: RDD[T], numPartitions: Optional[int] = ...
) -> RDD[T]: ...
def keyBy(self: RDD[T], f: Callable[[T], K]) -> RDD[Tuple[K, T]]: ...
def repartition(self, numPartitions: int) -> RDD[T]: ...
def coalesce(self, numPartitions: int, shuffle: bool = ...) -> RDD[T]: ...
def zip(self, other: RDD[U]) -> RDD[Tuple[T, U]]: ...
def zipWithIndex(self) -> RDD[Tuple[T, int]]: ...
def zipWithUniqueId(self) -> RDD[Tuple[T, int]]: ...
def name(self) -> str: ...
def setName(self, name: str) -> RDD[T]: ...
def toDebugString(self) -> bytes: ...
def getStorageLevel(self) -> StorageLevel: ...
def lookup(self: RDD[Tuple[K, V]], key: K) -> List[V]: ...
def countApprox(self, timeout: int, confidence: float = ...) -> int: ...
def sumApprox(
self: RDD[Union[float, int]], timeout: int, confidence: float = ...
) -> BoundedFloat: ...
def meanApprox(
self: RDD[Union[float, int]], timeout: int, confidence: float = ...
) -> BoundedFloat: ...
def countApproxDistinct(self, relativeSD: float = ...) -> int: ...
def toLocalIterator(self, prefetchPartitions: bool = ...) -> Iterator[T]: ...
def barrier(self: RDD[T]) -> RDDBarrier[T]: ...
def withResources(self: RDD[T], profile: ResourceProfile) -> RDD[T]: ...
def getResourceProfile(self) -> Optional[ResourceProfile]: ...
@overload
def toDF(
self: RDD[RowLike],
schema: Optional[List[str]] = ...,
sampleRatio: Optional[float] = ...,
) -> DataFrame: ...
@overload
def toDF(self: RDD[RowLike], schema: Optional[StructType] = ...) -> DataFrame: ...
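# Wrapper returned by RDD.barrier(); exposes the partition-level
# transformations allowed inside a barrier stage.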
class RDDBarrier(Generic[T]):
rdd: RDD[T]
def __init__(self, rdd: RDD[T]) -> None: ...
def mapPartitions(
self, f: Callable[[Iterable[T]], Iterable[U]], preservesPartitioning: bool = ...
) -> RDD[U]: ...
def mapPartitionsWithIndex(
self,
f: Callable[[int, Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
) -> RDD[U]: ...
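# Internal RDD produced by pipelining narrow transformations such as map and
# mapPartitions over their parent RDD.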
class PipelinedRDD(RDD[U], Generic[T, U]):
func: Callable[[Iterable[T]], Iterable[U]]
preservesPartitioning: bool
is_cached: bool
is_checkpointed: bool
ctx: pyspark.context.SparkContext
prev: RDD[T]
partitioner: Optional[Partitioner]
is_barrier: bool
def __init__(
self,
prev: RDD[T],
func: Callable[[Iterable[T]], Iterable[U]],
preservesPartitioning: bool = ...,
isFromBarrier: bool = ...,
) -> None: ...
def getNumPartitions(self) -> int: ...
def id(self) -> int: ...