python/pyspark/mllib/tree.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import sys
 import random

 from pyspark import RDD, since
 from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper
 from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.util import JavaLoader, JavaSaveable
 from typing import Dict, Optional, Tuple, Union, overload, TYPE_CHECKING
 from pyspark.core.rdd import RDD

 if TYPE_CHECKING:
     from pyspark.mllib._typing import VectorLike


 __all__ = [
     "DecisionTreeModel",
     "DecisionTree",
     "RandomForestModel",
     "RandomForest",
     "GradientBoostedTreesModel",
     "GradientBoostedTrees",
 ]


 class TreeEnsembleModel(JavaModelWrapper, JavaSaveable):
     """TreeEnsembleModel

     .. versionadded:: 1.3.0
     """

     @overload
     def predict(self, x: "VectorLike") -> float:
         ...

     @overload
     def predict(self, x: RDD["VectorLike"]) -> RDD[float]:
         ...

     def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
         """
         Predict values for a single data point or an RDD of points using
         the model trained.

         .. versionadded:: 1.3.0

         Notes
         -----
         In Python, predict cannot currently be used within an RDD
         transformation or action.
         Call predict directly on the RDD instead.
         """
         if isinstance(x, RDD):
             return self.call("predict", x.map(_convert_to_vector))

         else:
             return self.call("predict", _convert_to_vector(x))

     @since("1.3.0")
     def numTrees(self) -> int:
         """
         Get number of trees in ensemble.
         """
         return self.call("numTrees")

     @since("1.3.0")
     def totalNumNodes(self) -> int:
         """
         Get total number of nodes, summed over all trees in the ensemble.
         """
         return self.call("totalNumNodes")

     def __repr__(self) -> str:
         """Summary of model"""
         return self._java_model.toString()

     @since("1.3.0")
     def toDebugString(self) -> str:
         """Full model"""
         return self._java_model.toDebugString()


 class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader["DecisionTreeModel"]):
     """
     A decision tree model for classification or regression.

     .. versionadded:: 1.1.0
     """

     @overload
     def predict(self, x: "VectorLike") -> float:
         ...

     @overload
     def predict(self, x: RDD["VectorLike"]) -> RDD[float]:
         ...

     def predict(self, x: Union["VectorLike", RDD["VectorLike"]]) -> Union[float, RDD[float]]:
         """
         Predict the label of one or more examples.

         .. versionadded:: 1.1.0

         Parameters
         ----------
         x : :py:class:`pyspark.mllib.linalg.Vector` or :py:class:`pyspark.RDD`
             Data point (feature vector), or an RDD of data points (feature
             vectors).

         Notes
         -----
         In Python, predict cannot currently be used within an RDD
         transformation or action.
         Call predict directly on the RDD instead.
         """
         if isinstance(x, RDD):
             return self.call("predict", x.map(_convert_to_vector))

         else:
             return self.call("predict", _convert_to_vector(x))

     @since("1.1.0")
     def numNodes(self) -> int:
         """Get number of nodes in tree, including leaf nodes."""
         return self._java_model.numNodes()

     @since("1.1.0")
     def depth(self) -> int:
         """
         Get depth of tree (e.g. depth 0 means 1 leaf node, depth 1
         means 1 internal node + 2 leaf nodes).
         """
         return self._java_model.depth()

     def __repr__(self) -> str:
         """summary of model."""
         return self._java_model.toString()

     @since("1.2.0")
     def toDebugString(self) -> str:
         """full model."""
         return self._java_model.toDebugString()

     @classmethod
     def _java_loader_class(cls) -> str:
         return "org.apache.spark.mllib.tree.model.DecisionTreeModel"


 class DecisionTree:
     """
     Learning algorithm for a decision tree model for classification or
     regression.

     .. versionadded:: 1.1.0
     """

     @classmethod
     def _train(
         cls,
         data: RDD[LabeledPoint],
         type: str,
         numClasses: int,
         features: Dict[int, int],
         impurity: str = "gini",
         maxDepth: int = 5,
         maxBins: int = 32,
         minInstancesPerNode: int = 1,
         minInfoGain: float = 0.0,
     ) -> DecisionTreeModel:
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
         model = callMLlibFunc(
             "trainDecisionTreeModel",
             data,
             type,
             numClasses,
             features,
             impurity,
             maxDepth,
             maxBins,
             minInstancesPerNode,
             minInfoGain,
         )
         return DecisionTreeModel(model)

     @classmethod
     def trainClassifier(
         cls,
         data: RDD[LabeledPoint],
         numClasses: int,
         categoricalFeaturesInfo: Dict[int, int],
         impurity: str = "gini",
         maxDepth: int = 5,
         maxBins: int = 32,
         minInstancesPerNode: int = 1,
         minInfoGain: float = 0.0,
     ) -> DecisionTreeModel:
         """
         Train a decision tree model for classification.

         .. versionadded:: 1.1.0

         Parameters
         ----------
         data :  :py:class:`pyspark.RDD`
             Training data: RDD of LabeledPoint. Labels should take values
             {0, 1, ..., numClasses-1}.
         numClasses : int
             Number of classes for classification.
         categoricalFeaturesInfo : dict
             Map storing arity of categorical features. An entry (n -> k)
             indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         impurity : str, optional
             Criterion used for information gain calculation.
             Supported values: "gini" or "entropy".
             (default: "gini")
         maxDepth : int, optional
             Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
             means 1 internal node + 2 leaf nodes).
             (default: 5)
         maxBins : int, optional
             Number of bins used for finding splits at each node.
             (default: 32)
         minInstancesPerNode : int, optional
             Minimum number of instances required at child nodes to create
             the parent split.
             (default: 1)
         minInfoGain : float, optional
             Minimum info gain required to create a split.
             (default: 0.0)

         Returns
         -------
         :py:class:`DecisionTreeModel`

         Examples
         --------
         >>> from numpy import array
         >>> from pyspark.mllib.regression import LabeledPoint
         >>> from pyspark.mllib.tree import DecisionTree
         >>>
         >>> data = [
         ...     LabeledPoint(0.0, [0.0]),
         ...     LabeledPoint(1.0, [1.0]),
         ...     LabeledPoint(1.0, [2.0]),
         ...     LabeledPoint(1.0, [3.0])
         ... ]
         >>> model = DecisionTree.trainClassifier(sc.parallelize(data), 2, {})
         >>> print(model)
         DecisionTreeModel classifier of depth 1 with 3 nodes

         >>> print(model.toDebugString())
         DecisionTreeModel classifier of depth 1 with 3 nodes
           If (feature 0 <= 0.5)
            Predict: 0.0
           Else (feature 0 > 0.5)
            Predict: 1.0
         >>> model.predict(array([1.0]))
         1.0
         >>> model.predict(array([0.0]))
         0.0
         >>> rdd = sc.parallelize([[1.0], [0.0]])
         >>> model.predict(rdd).collect()
         [1.0, 0.0]
         """
         return cls._train(
             data,
             "classification",
             numClasses,
             categoricalFeaturesInfo,
             impurity,
             maxDepth,
             maxBins,
             minInstancesPerNode,
             minInfoGain,
         )

     @classmethod
     @since("1.1.0")
     def trainRegressor(
         cls,
         data: RDD[LabeledPoint],
         categoricalFeaturesInfo: Dict[int, int],
         impurity: str = "variance",
         maxDepth: int = 5,
         maxBins: int = 32,
         minInstancesPerNode: int = 1,
         minInfoGain: float = 0.0,
     ) -> DecisionTreeModel:
         """
         Train a decision tree model for regression.

         Parameters
         ----------
         data : :py:class:`pyspark.RDD`
             Training data: RDD of LabeledPoint. Labels are real numbers.
         categoricalFeaturesInfo : dict
             Map storing arity of categorical features. An entry (n -> k)
             indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         impurity : str, optional
             Criterion used for information gain calculation.
             The only supported value for regression is "variance".
             (default: "variance")
         maxDepth : int, optional
             Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
             means 1 internal node + 2 leaf nodes).
             (default: 5)
         maxBins : int, optional
             Number of bins used for finding splits at each node.
             (default: 32)
         minInstancesPerNode : int, optional
             Minimum number of instances required at child nodes to create
             the parent split.
             (default: 1)
         minInfoGain : float, optional
             Minimum info gain required to create a split.
             (default: 0.0)

         Returns
         -------
         :py:class:`DecisionTreeModel`

         Examples
         --------
         >>> from pyspark.mllib.regression import LabeledPoint
         >>> from pyspark.mllib.tree import DecisionTree
         >>> from pyspark.mllib.linalg import SparseVector
         >>>
         >>> sparse_data = [
         ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
         ...     LabeledPoint(0.0, SparseVector(2, {0: 0.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
         ... ]
         >>>
         >>> model = DecisionTree.trainRegressor(sc.parallelize(sparse_data), {})
         >>> model.predict(SparseVector(2, {1: 1.0}))
         1.0
         >>> model.predict(SparseVector(2, {1: 0.0}))
         0.0
         >>> rdd = sc.parallelize([[0.0, 1.0], [0.0, 0.0]])
         >>> model.predict(rdd).collect()
         [1.0, 0.0]
         """
         return cls._train(
             data,
             "regression",
             0,
             categoricalFeaturesInfo,
             impurity,
             maxDepth,
             maxBins,
             minInstancesPerNode,
             minInfoGain,
         )


 @inherit_doc
 class RandomForestModel(TreeEnsembleModel, JavaLoader["RandomForestModel"]):
     """
     Represents a random forest model.

     .. versionadded:: 1.2.0
     """

     @classmethod
     def _java_loader_class(cls) -> str:
         return "org.apache.spark.mllib.tree.model.RandomForestModel"


 class RandomForest:
     """
     Learning algorithm for a random forest model for classification or
     regression.

     .. versionadded:: 1.2.0
     """

     supportedFeatureSubsetStrategies: Tuple[str, ...] = ("auto", "all", "sqrt", "log2", "onethird")

     @classmethod
     def _train(
         cls,
         data: RDD[LabeledPoint],
         algo: str,
         numClasses: int,
         categoricalFeaturesInfo: Dict[int, int],
         numTrees: int,
         featureSubsetStrategy: str,
         impurity: str,
         maxDepth: int,
         maxBins: int,
         seed: Optional[int],
     ) -> RandomForestModel:
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
         if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
             raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
         if seed is None:
             seed = random.randint(0, 1 << 30)
         model = callMLlibFunc(
             "trainRandomForestModel",
             data,
             algo,
             numClasses,
             categoricalFeaturesInfo,
             numTrees,
             featureSubsetStrategy,
             impurity,
             maxDepth,
             maxBins,
             seed,
         )
         return RandomForestModel(model)

     @classmethod
     def trainClassifier(
         cls,
         data: RDD[LabeledPoint],
         numClasses: int,
         categoricalFeaturesInfo: Dict[int, int],
         numTrees: int,
         featureSubsetStrategy: str = "auto",
         impurity: str = "gini",
         maxDepth: int = 4,
         maxBins: int = 32,
         seed: Optional[int] = None,
     ) -> RandomForestModel:
         """
         Train a random forest model for binary or multiclass
         classification.

         .. versionadded:: 1.2.0

         Parameters
         ----------
         data : :py:class:`pyspark.RDD`
             Training dataset: RDD of LabeledPoint. Labels should take values
             {0, 1, ..., numClasses-1}.
         numClasses : int
             Number of classes for classification.
         categoricalFeaturesInfo : dict
             Map storing arity of categorical features. An entry (n -> k)
             indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         numTrees : int
             Number of trees in the random forest.
         featureSubsetStrategy : str, optional
             Number of features to consider for splits at each node.
             Supported values: "auto", "all", "sqrt", "log2", "onethird".
             If "auto" is set, this parameter is set based on numTrees:
             if numTrees == 1, set to "all";
             if numTrees > 1 (forest) set to "sqrt".
             (default: "auto")
         impurity : str, optional
             Criterion used for information gain calculation.
             Supported values: "gini" or "entropy".
             (default: "gini")
         maxDepth : int, optional
             Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
             means 1 internal node + 2 leaf nodes).
             (default: 4)
         maxBins : int, optional
             Maximum number of bins used for splitting features.
             (default: 32)
         seed : int, Optional
             Random seed for bootstrapping and choosing feature subsets.
             Set as None to generate seed based on system time.
             (default: None)

         Returns
         -------
         :py:class:`RandomForestModel`
             that can be used for prediction.

         Examples
         --------
         >>> from pyspark.mllib.regression import LabeledPoint
         >>> from pyspark.mllib.tree import RandomForest
         >>>
         >>> data = [
         ...     LabeledPoint(0.0, [0.0]),
         ...     LabeledPoint(0.0, [1.0]),
         ...     LabeledPoint(1.0, [2.0]),
         ...     LabeledPoint(1.0, [3.0])
         ... ]
         >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42)
         >>> model.numTrees()
         3
         >>> model.totalNumNodes()
         7
         >>> print(model)
         TreeEnsembleModel classifier with 3 trees
         >>> print(model.toDebugString())
         TreeEnsembleModel classifier with 3 trees
           Tree 0:
             Predict: 1.0
           Tree 1:
             If (feature 0 <= 1.5)
              Predict: 0.0
             Else (feature 0 > 1.5)
              Predict: 1.0
           Tree 2:
             If (feature 0 <= 1.5)
              Predict: 0.0
             Else (feature 0 > 1.5)
              Predict: 1.0
         >>> model.predict([2.0])
         1.0
         >>> model.predict([0.0])
         0.0
         >>> rdd = sc.parallelize([[3.0], [1.0]])
         >>> model.predict(rdd).collect()
         [1.0, 0.0]
         """
         return cls._train(
             data,
             "classification",
             numClasses,
             categoricalFeaturesInfo,
             numTrees,
             featureSubsetStrategy,
             impurity,
             maxDepth,
             maxBins,
             seed,
         )

     @classmethod
     def trainRegressor(
         cls,
         data: RDD[LabeledPoint],
         categoricalFeaturesInfo: Dict[int, int],
         numTrees: int,
         featureSubsetStrategy: str = "auto",
         impurity: str = "variance",
         maxDepth: int = 4,
         maxBins: int = 32,
         seed: Optional[int] = None,
     ) -> RandomForestModel:
         """
         Train a random forest model for regression.

         .. versionadded:: 1.2.0

         Parameters
         ----------
         data : :py:class:`pyspark.RDD`
             Training dataset: RDD of LabeledPoint. Labels are real numbers.
         categoricalFeaturesInfo : dict
             Map storing arity of categorical features. An entry (n -> k)
             indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         numTrees : int
             Number of trees in the random forest.
         featureSubsetStrategy : str, optional
             Number of features to consider for splits at each node.
             Supported values: "auto", "all", "sqrt", "log2", "onethird".
             If "auto" is set, this parameter is set based on numTrees:

             - if numTrees == 1, set to "all";
             - if numTrees > 1 (forest) set to "onethird" for regression.

             (default: "auto")
         impurity : str, optional
             Criterion used for information gain calculation.
             The only supported value for regression is "variance".
             (default: "variance")
         maxDepth : int, optional
             Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
             means 1 internal node + 2 leaf nodes).
             (default: 4)
         maxBins : int, optional
             Maximum number of bins used for splitting features.
             (default: 32)
         seed : int, optional
             Random seed for bootstrapping and choosing feature subsets.
             Set as None to generate seed based on system time.
             (default: None)

         Returns
         -------
         :py:class:`RandomForestModel`
             that can be used for prediction.

         Examples
         --------
         >>> from pyspark.mllib.regression import LabeledPoint
         >>> from pyspark.mllib.tree import RandomForest
         >>> from pyspark.mllib.linalg import SparseVector
         >>>
         >>> sparse_data = [
         ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
         ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
         ... ]
         >>>
         >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42)
         >>> model.numTrees()
         2
         >>> model.totalNumNodes()
         4
         >>> model.predict(SparseVector(2, {1: 1.0}))
         1.0
         >>> model.predict(SparseVector(2, {0: 1.0}))
         0.5
         >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
         >>> model.predict(rdd).collect()
         [1.0, 0.5]
         """
         return cls._train(
             data,
             "regression",
             0,
             categoricalFeaturesInfo,
             numTrees,
             featureSubsetStrategy,
             impurity,
             maxDepth,
             maxBins,
             seed,
         )


 @inherit_doc
 class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader["GradientBoostedTreesModel"]):
     """
     Represents a gradient-boosted tree model.

     .. versionadded:: 1.3.0
     """

     @classmethod
     def _java_loader_class(cls) -> str:
         return "org.apache.spark.mllib.tree.model.GradientBoostedTreesModel"


 class GradientBoostedTrees:
     """
     Learning algorithm for a gradient boosted trees model for
     classification or regression.

     .. versionadded:: 1.3.0
     """

     @classmethod
     def _train(
         cls,
         data: RDD[LabeledPoint],
         algo: str,
         categoricalFeaturesInfo: Dict[int, int],
         loss: str,
         numIterations: int,
         learningRate: float,
         maxDepth: int,
         maxBins: int,
     ) -> GradientBoostedTreesModel:
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
         model = callMLlibFunc(
             "trainGradientBoostedTreesModel",
             data,
             algo,
             categoricalFeaturesInfo,
             loss,
             numIterations,
             learningRate,
             maxDepth,
             maxBins,
         )
         return GradientBoostedTreesModel(model)

     @classmethod
     def trainClassifier(
         cls,
         data: RDD[LabeledPoint],
         categoricalFeaturesInfo: Dict[int, int],
         loss: str = "logLoss",
         numIterations: int = 100,
         learningRate: float = 0.1,
         maxDepth: int = 3,
         maxBins: int = 32,
     ) -> GradientBoostedTreesModel:
         """
         Train a gradient-boosted trees model for classification.

         .. versionadded:: 1.3.0

         Parameters
         ----------
         data : :py:class:`pyspark.RDD`
             Training dataset: RDD of LabeledPoint. Labels should take values
             {0, 1}.
         categoricalFeaturesInfo : dict
             Map storing arity of categorical features. An entry (n -> k)
             indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         loss : str, optional
             Loss function used for minimization during gradient boosting.
             Supported values: "logLoss", "leastSquaresError",
             "leastAbsoluteError".
             (default: "logLoss")
         numIterations : int, optional
             Number of iterations of boosting.
             (default: 100)
         learningRate : float, optional
             Learning rate for shrinking the contribution of each estimator.
             The learning rate should be between in the interval (0, 1].
             (default: 0.1)
         maxDepth : int, optional
             Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
             means 1 internal node + 2 leaf nodes).
             (default: 3)
         maxBins : int, optional
             Maximum number of bins used for splitting features. DecisionTree
             requires maxBins >= max categories.
             (default: 32)

         Returns
         -------
         :py:class:`GradientBoostedTreesModel`
             that can be used for prediction.

         Examples
         --------
         >>> from pyspark.mllib.regression import LabeledPoint
         >>> from pyspark.mllib.tree import GradientBoostedTrees
         >>>
         >>> data = [
         ...     LabeledPoint(0.0, [0.0]),
         ...     LabeledPoint(0.0, [1.0]),
         ...     LabeledPoint(1.0, [2.0]),
         ...     LabeledPoint(1.0, [3.0])
         ... ]
         >>>
         >>> model = GradientBoostedTrees.trainClassifier(sc.parallelize(data), {}, numIterations=10)
         >>> model.numTrees()
         10
         >>> model.totalNumNodes()
         30
         >>> print(model)  # it already has newline
         TreeEnsembleModel classifier with 10 trees
         >>> model.predict([2.0])
         1.0
         >>> model.predict([0.0])
         0.0
         >>> rdd = sc.parallelize([[2.0], [0.0]])
         >>> model.predict(rdd).collect()
         [1.0, 0.0]
         """
         return cls._train(
             data,
             "classification",
             categoricalFeaturesInfo,
             loss,
             numIterations,
             learningRate,
             maxDepth,
             maxBins,
         )

     @classmethod
     def trainRegressor(
         cls,
         data: RDD[LabeledPoint],
         categoricalFeaturesInfo: Dict[int, int],
         loss: str = "leastSquaresError",
         numIterations: int = 100,
         learningRate: float = 0.1,
         maxDepth: int = 3,
         maxBins: int = 32,
     ) -> GradientBoostedTreesModel:
         """
         Train a gradient-boosted trees model for regression.

         .. versionadded:: 1.3.0

         Parameters
         ----------
         data :
             Training dataset: RDD of LabeledPoint. Labels are real numbers.
         categoricalFeaturesInfo : dict
             Map storing arity of categorical features. An entry (n -> k)
             indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         loss : str, optional
             Loss function used for minimization during gradient boosting.
             Supported values: "logLoss", "leastSquaresError",
             "leastAbsoluteError".
             (default: "leastSquaresError")
         numIterations : int, optional
             Number of iterations of boosting.
             (default: 100)
         learningRate : float, optional
             Learning rate for shrinking the contribution of each estimator.
             The learning rate should be between in the interval (0, 1].
             (default: 0.1)
         maxDepth : int, optional
             Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
             means 1 internal node + 2 leaf nodes).
             (default: 3)
         maxBins : int, optional
             Maximum number of bins used for splitting features. DecisionTree
             requires maxBins >= max categories.
             (default: 32)

         Returns
         -------
         :py:class:`GradientBoostedTreesModel`
             that can be used for prediction.

         Examples
         --------
         >>> from pyspark.mllib.regression import LabeledPoint
         >>> from pyspark.mllib.tree import GradientBoostedTrees
         >>> from pyspark.mllib.linalg import SparseVector
         >>>
         >>> sparse_data = [
         ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
         ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
         ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
         ... ]
         >>>
         >>> data = sc.parallelize(sparse_data)
         >>> model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10)
         >>> model.numTrees()
         10
         >>> model.totalNumNodes()
         12
         >>> model.predict(SparseVector(2, {1: 1.0}))
         1.0
         >>> model.predict(SparseVector(2, {0: 1.0}))
         0.0
         >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
         >>> model.predict(rdd).collect()
         [1.0, 0.0]
         """
         return cls._train(
             data,
             "regression",
             categoricalFeaturesInfo,
             loss,
             numIterations,
             learningRate,
             maxDepth,
             maxBins,
         )


 def _test() -> None:
     import doctest

     globs = globals().copy()
     from pyspark.sql import SparkSession

     spark = SparkSession.builder.master("local[4]").appName("mllib.tree tests").getOrCreate()
     globs["sc"] = spark.sparkContext
     (failure_count, test_count) = doctest.testmod(
         globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
     )
     spark.stop()
     if failure_count:
         sys.exit(-1)


 if __name__ == "__main__":
     _test()