| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| import warnings |
| |
| from pyspark import since |
| from pyspark.ml.util import keyword_only |
| from pyspark.ml.wrapper import JavaEstimator, JavaModel |
| from pyspark.ml.param.shared import * |
| from pyspark.mllib.common import inherit_doc |
| |
| |
| __all__ = ['AFTSurvivalRegression', 'AFTSurvivalRegressionModel', |
| 'DecisionTreeRegressor', 'DecisionTreeRegressionModel', |
| 'GBTRegressor', 'GBTRegressionModel', |
| 'IsotonicRegression', 'IsotonicRegressionModel', |
| 'LinearRegression', 'LinearRegressionModel', |
| 'RandomForestRegressor', 'RandomForestRegressionModel'] |
| |
| |
| @inherit_doc |
| class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, |
| HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept, |
| HasStandardization, HasSolver, HasWeightCol): |
| """ |
| Linear regression. |
| |
The learning objective is to minimize the squared error, with regularization.
The specific squared error loss function used is: L = 1/2n ||A coefficients - y||^2

It supports multiple types of regularization:
| - none (a.k.a. ordinary least squares) |
| - L2 (ridge regression) |
| - L1 (Lasso) |
| - L2 + L1 (elastic net) |
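
The blend is controlled by elasticNetParam (from the shared
HasElasticNetParam mixin): 0.0 gives a pure L2 penalty, 1.0 a pure L1
penalty, and intermediate values mix the two. A minimal sketch:

>>> ridge = LinearRegression(regParam=0.1, elasticNetParam=0.0)
>>> lasso = LinearRegression(regParam=0.1, elasticNetParam=1.0)
>>> lasso.getElasticNetParam()
1.0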
| |
| >>> from pyspark.mllib.linalg import Vectors |
| >>> df = sqlContext.createDataFrame([ |
| ... (1.0, 2.0, Vectors.dense(1.0)), |
| ... (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "features"]) |
| >>> lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight") |
| >>> model = lr.fit(df) |
| >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) |
| >>> abs(model.transform(test0).head().prediction - (-1.0)) < 0.001 |
| True |
| >>> abs(model.coefficients[0] - 1.0) < 0.001 |
| True |
| >>> abs(model.intercept - 0.0) < 0.001 |
| True |
| >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) |
| >>> abs(model.transform(test1).head().prediction - 1.0) < 0.001 |
| True |
| >>> lr.setParams("vector") |
| Traceback (most recent call last): |
| ... |
| TypeError: Method setParams forces keyword arguments. |
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| @keyword_only |
| def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, |
| standardization=True, solver="auto", weightCol=None): |
| """ |
| __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ |
| standardization=True, solver="auto", weightCol=None) |
| """ |
| super(LinearRegression, self).__init__() |
| self._java_obj = self._new_java_obj( |
| "org.apache.spark.ml.regression.LinearRegression", self.uid) |
| self._setDefault(maxIter=100, regParam=0.0, tol=1e-6) |
| kwargs = self.__init__._input_kwargs |
| self.setParams(**kwargs) |
| |
| @keyword_only |
| @since("1.4.0") |
| def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, |
| standardization=True, solver="auto", weightCol=None): |
| """ |
| setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ |
| standardization=True, solver="auto", weightCol=None) |
| Sets params for linear regression. |
| """ |
| kwargs = self.setParams._input_kwargs |
| return self._set(**kwargs) |
| |
| def _create_model(self, java_model): |
| return LinearRegressionModel(java_model) |
| |
| |
| class LinearRegressionModel(JavaModel): |
| """ |
| Model fitted by LinearRegression. |
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| @property |
| @since("1.4.0") |
| def weights(self): |
| """ |
| Model weights. |
| """ |
| |
| warnings.warn("weights is deprecated. Use coefficients instead.") |
| return self._call_java("weights") |
| |
| @property |
| @since("1.6.0") |
| def coefficients(self): |
| """ |
| Model coefficients. |
| """ |
| return self._call_java("coefficients") |
| |
| @property |
| @since("1.4.0") |
| def intercept(self): |
| """ |
| Model intercept. |
| """ |
| return self._call_java("intercept") |
| |
| |
| @inherit_doc |
| class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, |
| HasWeightCol): |
| """ |
| .. note:: Experimental |
| |
Isotonic regression, currently implemented using the parallelized pool
adjacent violators algorithm. Only the univariate (single feature)
algorithm is supported.
| |
| >>> from pyspark.mllib.linalg import Vectors |
| >>> df = sqlContext.createDataFrame([ |
| ... (1.0, Vectors.dense(1.0)), |
| ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) |
| >>> ir = IsotonicRegression() |
| >>> model = ir.fit(df) |
| >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) |
| >>> model.transform(test0).head().prediction |
| 0.0 |
| >>> model.boundaries |
| DenseVector([0.0, 1.0]) |
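
An antitonic (decreasing) fit can be requested instead through the
isotonic param; a minimal sketch using the accessors defined below:

>>> ir_anti = IsotonicRegression().setIsotonic(False)
>>> ir_anti.getIsotonic()
False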
| """ |
| |
| # a placeholder to make it appear in the generated doc |
| isotonic = \ |
| Param(Params._dummy(), "isotonic", |
| "whether the output sequence should be isotonic/increasing (true) or" + |
| "antitonic/decreasing (false).") |
| featureIndex = \ |
| Param(Params._dummy(), "featureIndex", |
| "The index of the feature if featuresCol is a vector column, no effect otherwise.") |
| |
| @keyword_only |
| def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| weightCol=None, isotonic=True, featureIndex=0): |
| """ |
| __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
weightCol=None, isotonic=True, featureIndex=0)
| """ |
| super(IsotonicRegression, self).__init__() |
| self._java_obj = self._new_java_obj( |
| "org.apache.spark.ml.regression.IsotonicRegression", self.uid) |
| self.isotonic = \ |
| Param(self, "isotonic", |
| "whether the output sequence should be isotonic/increasing (true) or" + |
| "antitonic/decreasing (false).") |
| self.featureIndex = \ |
| Param(self, "featureIndex", |
| "The index of the feature if featuresCol is a vector column, no effect " + |
| "otherwise.") |
| self._setDefault(isotonic=True, featureIndex=0) |
| kwargs = self.__init__._input_kwargs |
| self.setParams(**kwargs) |
| |
| @keyword_only |
| def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| weightCol=None, isotonic=True, featureIndex=0): |
| """ |
| setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
weightCol=None, isotonic=True, featureIndex=0)
Sets params for IsotonicRegression.
| """ |
| kwargs = self.setParams._input_kwargs |
| return self._set(**kwargs) |
| |
| def _create_model(self, java_model): |
| return IsotonicRegressionModel(java_model) |
| |
| def setIsotonic(self, value): |
| """ |
| Sets the value of :py:attr:`isotonic`. |
| """ |
| self._paramMap[self.isotonic] = value |
| return self |
| |
| def getIsotonic(self): |
| """ |
| Gets the value of isotonic or its default value. |
| """ |
| return self.getOrDefault(self.isotonic) |
| |
| def setFeatureIndex(self, value): |
| """ |
| Sets the value of :py:attr:`featureIndex`. |
| """ |
| self._paramMap[self.featureIndex] = value |
| return self |
| |
| def getFeatureIndex(self): |
| """ |
| Gets the value of featureIndex or its default value. |
| """ |
| return self.getOrDefault(self.featureIndex) |
| |
| |
| class IsotonicRegressionModel(JavaModel): |
| """ |
| .. note:: Experimental |
| |
| Model fitted by IsotonicRegression. |
| """ |
| |
| @property |
| def boundaries(self): |
| """ |
| Model boundaries. |
| """ |
| return self._call_java("boundaries") |
| |
| @property |
| def predictions(self): |
| """ |
| Predictions associated with the boundaries at the same index, monotone because of isotonic |
| regression. |
| """ |
| return self._call_java("predictions") |
| |
| |
| class TreeEnsembleParams(DecisionTreeParams): |
| """ |
Mixin for parameters of Decision Tree-based ensemble algorithms.
| """ |
| |
| # a placeholder to make it appear in the generated doc |
| subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " + |
| "used for learning each decision tree, in range (0, 1].") |
| |
| def __init__(self): |
| super(TreeEnsembleParams, self).__init__() |
| #: param for Fraction of the training data, in range (0, 1]. |
| self.subsamplingRate = Param(self, "subsamplingRate", "Fraction of the training data " + |
| "used for learning each decision tree, in range (0, 1].") |
| |
| @since("1.4.0") |
| def setSubsamplingRate(self, value): |
| """ |
| Sets the value of :py:attr:`subsamplingRate`. |
| """ |
| self._paramMap[self.subsamplingRate] = value |
| return self |
| |
| @since("1.4.0") |
| def getSubsamplingRate(self): |
| """ |
| Gets the value of subsamplingRate or its default value. |
| """ |
| return self.getOrDefault(self.subsamplingRate) |
| |
| |
| class TreeRegressorParams(Params): |
| """ |
| Private class to track supported impurity measures. |
| """ |
| |
| supportedImpurities = ["variance"] |
| # a placeholder to make it appear in the generated doc |
| impurity = Param(Params._dummy(), "impurity", |
| "Criterion used for information gain calculation (case-insensitive). " + |
| "Supported options: " + |
| ", ".join(supportedImpurities)) |
| |
| def __init__(self): |
| super(TreeRegressorParams, self).__init__() |
| #: param for Criterion used for information gain calculation (case-insensitive). |
| self.impurity = Param(self, "impurity", "Criterion used for information " + |
| "gain calculation (case-insensitive). Supported options: " + |
| ", ".join(self.supportedImpurities)) |
| |
| @since("1.4.0") |
| def setImpurity(self, value): |
| """ |
| Sets the value of :py:attr:`impurity`. |
| """ |
| self._paramMap[self.impurity] = value |
| return self |
| |
| @since("1.4.0") |
| def getImpurity(self): |
| """ |
| Gets the value of impurity or its default value. |
| """ |
| return self.getOrDefault(self.impurity) |
| |
| |
| class RandomForestParams(TreeEnsembleParams): |
| """ |
| Private class to track supported random forest parameters. |
| """ |
| |
| supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"] |
| # a placeholder to make it appear in the generated doc |
| numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).") |
| featureSubsetStrategy = \ |
| Param(Params._dummy(), "featureSubsetStrategy", |
| "The number of features to consider for splits at each tree node. Supported " + |
| "options: " + ", ".join(supportedFeatureSubsetStrategies)) |
| |
| def __init__(self): |
| super(RandomForestParams, self).__init__() |
| #: param for Number of trees to train (>= 1). |
| self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1).") |
| #: param for The number of features to consider for splits at each tree node. |
| self.featureSubsetStrategy = \ |
| Param(self, "featureSubsetStrategy", |
| "The number of features to consider for splits at each tree node. Supported " + |
| "options: " + ", ".join(self.supportedFeatureSubsetStrategies)) |
| |
| @since("1.4.0") |
| def setNumTrees(self, value): |
| """ |
| Sets the value of :py:attr:`numTrees`. |
| """ |
| self._paramMap[self.numTrees] = value |
| return self |
| |
| @since("1.4.0") |
| def getNumTrees(self): |
| """ |
| Gets the value of numTrees or its default value. |
| """ |
| return self.getOrDefault(self.numTrees) |
| |
| @since("1.4.0") |
| def setFeatureSubsetStrategy(self, value): |
| """ |
| Sets the value of :py:attr:`featureSubsetStrategy`. |
| """ |
| self._paramMap[self.featureSubsetStrategy] = value |
| return self |
| |
| @since("1.4.0") |
| def getFeatureSubsetStrategy(self): |
| """ |
| Gets the value of featureSubsetStrategy or its default value. |
| """ |
| return self.getOrDefault(self.featureSubsetStrategy) |
| |
| |
| class GBTParams(TreeEnsembleParams): |
| """ |
| Private class to track supported GBT params. |
| """ |
| supportedLossTypes = ["squared", "absolute"] |
| |
| |
| @inherit_doc |
| class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, |
| DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval): |
| """ |
`Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
| learning algorithm for regression. |
| It supports both continuous and categorical features. |
| |
| >>> from pyspark.mllib.linalg import Vectors |
| >>> df = sqlContext.createDataFrame([ |
| ... (1.0, Vectors.dense(1.0)), |
| ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) |
| >>> dt = DecisionTreeRegressor(maxDepth=2) |
| >>> model = dt.fit(df) |
| >>> model.depth |
| 1 |
| >>> model.numNodes |
| 3 |
| >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) |
| >>> model.transform(test0).head().prediction |
| 0.0 |
| >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) |
| >>> model.transform(test1).head().prediction |
| 1.0 |
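
Tree hyperparameters can also be adjusted after construction; a minimal
sketch, assuming the shared setMaxDepth/getMaxDepth accessors inherited
from DecisionTreeParams:

>>> dt2 = dt.setMaxDepth(3)
>>> dt2.getMaxDepth()
3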
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| @keyword_only |
| def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance"): |
| """ |
| __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance") |
| """ |
| super(DecisionTreeRegressor, self).__init__() |
| self._java_obj = self._new_java_obj( |
| "org.apache.spark.ml.regression.DecisionTreeRegressor", self.uid) |
| self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, |
| impurity="variance") |
| kwargs = self.__init__._input_kwargs |
| self.setParams(**kwargs) |
| |
| @keyword_only |
| @since("1.4.0") |
| def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, |
| impurity="variance"): |
| """ |
| setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="variance") |
| Sets params for the DecisionTreeRegressor. |
| """ |
| kwargs = self.setParams._input_kwargs |
| return self._set(**kwargs) |
| |
| def _create_model(self, java_model): |
| return DecisionTreeRegressionModel(java_model) |
| |
| |
| @inherit_doc |
| class DecisionTreeModel(JavaModel): |
| """Abstraction for Decision Tree models. |
| |
| .. versionadded:: 1.5.0 |
| """ |
| |
| @property |
| @since("1.5.0") |
| def numNodes(self): |
| """Return number of nodes of the decision tree.""" |
| return self._call_java("numNodes") |
| |
| @property |
| @since("1.5.0") |
| def depth(self): |
| """Return depth of the decision tree.""" |
| return self._call_java("depth") |
| |
| def __repr__(self): |
| return self._call_java("toString") |
| |
| |
| @inherit_doc |
| class TreeEnsembleModels(JavaModel): |
| """Represents a tree ensemble model. |
| |
| .. versionadded:: 1.5.0 |
| """ |
| |
| @property |
| @since("1.5.0") |
| def treeWeights(self): |
| """Return the weights for each tree""" |
| return list(self._call_java("javaTreeWeights")) |
| |
| def __repr__(self): |
| return self._call_java("toString") |
| |
| |
| @inherit_doc |
| class DecisionTreeRegressionModel(DecisionTreeModel): |
| """ |
| Model fitted by DecisionTreeRegressor. |
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| |
| @inherit_doc |
| class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, |
| RandomForestParams, TreeRegressorParams, HasCheckpointInterval): |
| """ |
`Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
| learning algorithm for regression. |
| It supports both continuous and categorical features. |
| |
| >>> from numpy import allclose |
| >>> from pyspark.mllib.linalg import Vectors |
| >>> df = sqlContext.createDataFrame([ |
| ... (1.0, Vectors.dense(1.0)), |
| ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) |
| >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) |
| >>> model = rf.fit(df) |
| >>> allclose(model.treeWeights, [1.0, 1.0]) |
| True |
| >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) |
| >>> model.transform(test0).head().prediction |
| 0.0 |
| >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) |
| >>> model.transform(test1).head().prediction |
| 0.5 |
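
Ensemble size and per-node feature sampling are controlled by numTrees
and featureSubsetStrategy, via the accessors defined on
RandomForestParams; a minimal sketch:

>>> rf.getNumTrees()
2
>>> rf.getFeatureSubsetStrategy()
'auto'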
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| @keyword_only |
| def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, |
| impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, |
| featureSubsetStrategy="auto"): |
| """ |
| __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ |
| impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ |
| featureSubsetStrategy="auto") |
| """ |
| super(RandomForestRegressor, self).__init__() |
| self._java_obj = self._new_java_obj( |
| "org.apache.spark.ml.regression.RandomForestRegressor", self.uid) |
| self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, |
| impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, |
| featureSubsetStrategy="auto") |
| kwargs = self.__init__._input_kwargs |
| self.setParams(**kwargs) |
| |
| @keyword_only |
| @since("1.4.0") |
| def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, |
| impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, |
| featureSubsetStrategy="auto"): |
| """ |
| setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ |
| maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ |
| impurity="variance", subsamplingRate=1.0, seed=None, numTrees=20, \ |
| featureSubsetStrategy="auto") |
Sets params for the RandomForestRegressor.
| """ |
| kwargs = self.setParams._input_kwargs |
| return self._set(**kwargs) |
| |
| def _create_model(self, java_model): |
| return RandomForestRegressionModel(java_model) |
| |
| |
| class RandomForestRegressionModel(TreeEnsembleModels): |
| """ |
| Model fitted by RandomForestRegressor. |
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| |
| @inherit_doc |
| class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, |
| GBTParams, HasCheckpointInterval, HasStepSize, HasSeed): |
| """ |
`Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
| learning algorithm for regression. |
| It supports both continuous and categorical features. |
| |
| >>> from numpy import allclose |
| >>> from pyspark.mllib.linalg import Vectors |
| >>> df = sqlContext.createDataFrame([ |
| ... (1.0, Vectors.dense(1.0)), |
| ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) |
| >>> gbt = GBTRegressor(maxIter=5, maxDepth=2) |
| >>> model = gbt.fit(df) |
| >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) |
| True |
| >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) |
| >>> model.transform(test0).head().prediction |
| 0.0 |
| >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) |
| >>> model.transform(test1).head().prediction |
| 1.0 |
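
The loss being minimized can be switched between the supported types
listed in GBTParams.supportedLossTypes, via setLossType; a minimal
sketch:

>>> gbt.getLossType()
'squared'
>>> gbt2 = gbt.setLossType("absolute")
>>> gbt2.getLossType()
'absolute'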
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| # a placeholder to make it appear in the generated doc |
| lossType = Param(Params._dummy(), "lossType", |
| "Loss function which GBT tries to minimize (case-insensitive). " + |
| "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) |
| |
| @keyword_only |
| def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, |
| checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): |
| """ |
| __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ |
| maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ |
| checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) |
| """ |
| super(GBTRegressor, self).__init__() |
| self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) |
| #: param for Loss function which GBT tries to minimize (case-insensitive). |
| self.lossType = Param(self, "lossType", |
| "Loss function which GBT tries to minimize (case-insensitive). " + |
| "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) |
| self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, |
| checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) |
| kwargs = self.__init__._input_kwargs |
| self.setParams(**kwargs) |
| |
| @keyword_only |
| @since("1.4.0") |
| def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, |
| maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, |
| checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): |
| """ |
| setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ |
| maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ |
| checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) |
| Sets params for Gradient Boosted Tree Regression. |
| """ |
| kwargs = self.setParams._input_kwargs |
| return self._set(**kwargs) |
| |
| def _create_model(self, java_model): |
| return GBTRegressionModel(java_model) |
| |
| @since("1.4.0") |
| def setLossType(self, value): |
| """ |
| Sets the value of :py:attr:`lossType`. |
| """ |
| self._paramMap[self.lossType] = value |
| return self |
| |
| @since("1.4.0") |
| def getLossType(self): |
| """ |
| Gets the value of lossType or its default value. |
| """ |
| return self.getOrDefault(self.lossType) |
| |
| |
| class GBTRegressionModel(TreeEnsembleModels): |
| """ |
| Model fitted by GBTRegressor. |
| |
| .. versionadded:: 1.4.0 |
| """ |
| |
| |
| @inherit_doc |
| class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, |
| HasFitIntercept, HasMaxIter, HasTol): |
| """ |
| Accelerated Failure Time (AFT) Model Survival Regression |
| |
| Fit a parametric AFT survival regression model based on the Weibull distribution |
| of the survival time. |
| |
| .. seealso:: `AFT Model <https://en.wikipedia.org/wiki/Accelerated_failure_time_model>`_ |
| |
| >>> from pyspark.mllib.linalg import Vectors |
| >>> df = sqlContext.createDataFrame([ |
| ... (1.0, Vectors.dense(1.0), 1.0), |
| ... (0.0, Vectors.sparse(1, [], []), 0.0)], ["label", "features", "censor"]) |
| >>> aftsr = AFTSurvivalRegression() |
| >>> model = aftsr.fit(df) |
| >>> model.predict(Vectors.dense(6.3)) |
| 1.0 |
| >>> model.predictQuantiles(Vectors.dense(6.3)) |
| DenseVector([0.0101, 0.0513, 0.1054, 0.2877, 0.6931, 1.3863, 2.3026, 2.9957, 4.6052]) |
| >>> model.transform(df).show() |
| +-----+---------+------+----------+ |
| |label| features|censor|prediction| |
| +-----+---------+------+----------+ |
| | 1.0| [1.0]| 1.0| 1.0| |
| | 0.0|(1,[],[])| 0.0| 1.0| |
| +-----+---------+------+----------+ |
| ... |
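
The probabilities behind predictQuantiles come from the
quantileProbabilities param; a minimal sketch of inspecting and
overriding the default:

>>> aftsr.getQuantileProbabilities()
[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
>>> aftsr2 = aftsr.setQuantileProbabilities([0.25, 0.5, 0.75])
>>> aftsr2.getQuantileProbabilities()
[0.25, 0.5, 0.75]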
| |
| .. versionadded:: 1.6.0 |
| """ |
| |
| # a placeholder to make it appear in the generated doc |
| censorCol = Param(Params._dummy(), "censorCol", |
| "censor column name. The value of this column could be 0 or 1. " + |
| "If the value is 1, it means the event has occurred i.e. " + |
| "uncensored; otherwise censored.") |
| quantileProbabilities = \ |
| Param(Params._dummy(), "quantileProbabilities", |
| "quantile probabilities array. Values of the quantile probabilities array " + |
| "should be in the range (0, 1) and the array should be non-empty.") |
| quantilesCol = Param(Params._dummy(), "quantilesCol", |
| "quantiles column name. This column will output quantiles of " + |
| "corresponding quantileProbabilities if it is set.") |
| |
| @keyword_only |
| def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", |
| quantileProbabilities=None, quantilesCol=None): |
| """ |
| __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \ |
| quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], \ |
| quantilesCol=None) |
| """ |
| super(AFTSurvivalRegression, self).__init__() |
| self._java_obj = self._new_java_obj( |
| "org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid) |
| #: Param for censor column name |
| self.censorCol = Param(self, "censorCol", |
| "censor column name. The value of this column could be 0 or 1. " + |
| "If the value is 1, it means the event has occurred i.e. " + |
| "uncensored; otherwise censored.") |
| #: Param for quantile probabilities array |
| self.quantileProbabilities = \ |
| Param(self, "quantileProbabilities", |
| "quantile probabilities array. Values of the quantile probabilities array " + |
| "should be in the range (0, 1) and the array should be non-empty.") |
| #: Param for quantiles column name |
| self.quantilesCol = Param(self, "quantilesCol", |
| "quantiles column name. This column will output quantiles of " + |
| "corresponding quantileProbabilities if it is set.") |
| self._setDefault(censorCol="censor", |
| quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]) |
| kwargs = self.__init__._input_kwargs |
| self.setParams(**kwargs) |
| |
| @keyword_only |
| @since("1.6.0") |
| def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", |
| fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", |
| quantileProbabilities=None, quantilesCol=None): |
| """ |
| setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ |
| fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \ |
| quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], \ |
quantilesCol=None)
| """ |
| kwargs = self.setParams._input_kwargs |
| return self._set(**kwargs) |
| |
| def _create_model(self, java_model): |
| return AFTSurvivalRegressionModel(java_model) |
| |
| @since("1.6.0") |
| def setCensorCol(self, value): |
| """ |
| Sets the value of :py:attr:`censorCol`. |
| """ |
| self._paramMap[self.censorCol] = value |
| return self |
| |
| @since("1.6.0") |
| def getCensorCol(self): |
| """ |
| Gets the value of censorCol or its default value. |
| """ |
| return self.getOrDefault(self.censorCol) |
| |
| @since("1.6.0") |
| def setQuantileProbabilities(self, value): |
| """ |
| Sets the value of :py:attr:`quantileProbabilities`. |
| """ |
| self._paramMap[self.quantileProbabilities] = value |
| return self |
| |
| @since("1.6.0") |
| def getQuantileProbabilities(self): |
| """ |
| Gets the value of quantileProbabilities or its default value. |
| """ |
| return self.getOrDefault(self.quantileProbabilities) |
| |
| @since("1.6.0") |
| def setQuantilesCol(self, value): |
| """ |
| Sets the value of :py:attr:`quantilesCol`. |
| """ |
| self._paramMap[self.quantilesCol] = value |
| return self |
| |
| @since("1.6.0") |
| def getQuantilesCol(self): |
| """ |
| Gets the value of quantilesCol or its default value. |
| """ |
| return self.getOrDefault(self.quantilesCol) |
| |
| |
| class AFTSurvivalRegressionModel(JavaModel): |
| """ |
| Model fitted by AFTSurvivalRegression. |
| |
| .. versionadded:: 1.6.0 |
| """ |
| |
| @property |
| @since("1.6.0") |
| def coefficients(self): |
| """ |
| Model coefficients. |
| """ |
| return self._call_java("coefficients") |
| |
| @property |
| @since("1.6.0") |
| def intercept(self): |
| """ |
| Model intercept. |
| """ |
| return self._call_java("intercept") |
| |
| @property |
| @since("1.6.0") |
| def scale(self): |
| """ |
Model scale parameter.
| """ |
| return self._call_java("scale") |
| |
| def predictQuantiles(self, features): |
| """ |
Predicted quantiles for the given features.
| """ |
| return self._call_java("predictQuantiles", features) |
| |
| def predict(self, features): |
| """ |
Predicted value for the given features.
| """ |
| return self._call_java("predict", features) |
| |
| |
| if __name__ == "__main__": |
| import doctest |
| from pyspark.context import SparkContext |
| from pyspark.sql import SQLContext |
| globs = globals().copy() |
# Run the doctests with a local SparkContext using two worker threads.
sc = SparkContext("local[2]", "ml.regression tests")
| sqlContext = SQLContext(sc) |
| globs['sc'] = sc |
| globs['sqlContext'] = sqlContext |
| (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) |
| sc.stop() |
| if failure_count: |
| exit(-1) |