#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# mllib_regression.R: Provides methods for integrating MLlib regression algorithms
# (except tree-based algorithms)

#' S4 class that represents an AFTSurvivalRegressionModel
#'
#' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper
#' @note AFTSurvivalRegressionModel since 2.0.0
setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))
#' S4 class that represents a generalized linear model
#'
#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
#' @note GeneralizedLinearRegressionModel since 2.0.0
setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
#' S4 class that represents an IsotonicRegressionModel
#'
#' @param jobj a Java object reference to the backing Scala IsotonicRegressionWrapper
#' @note IsotonicRegressionModel since 2.1.0
setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' S4 class that represents a LinearRegressionModel
#'
#' @param jobj a Java object reference to the backing Scala LinearRegressionWrapper
#' @note LinearRegressionModel since 3.1.0
setClass("LinearRegressionModel", representation(jobj = "jobj"))
#' S4 class that represents an FMRegressionModel
#'
#' @param jobj a Java object reference to the backing Scala FMRegressorWrapper
#' @note FMRegressionModel since 3.1.0
setClass("FMRegressionModel", representation(jobj = "jobj"))
#' Generalized Linear Models
#'
#' Fits generalized linear model against a SparkDataFrame.
#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
#'
#' @param data a SparkDataFrame for training.
#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', '.', ':', '+', '-', '*', and '^'.
#' @param family a description of the error distribution and link function to be used in the model.
#' This can be a character string naming a family function, a family function or
#' the result of a call to a family function. Refer to R's family documentation at
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
#' Currently these families are supported: \code{binomial}, \code{gaussian},
#' \code{Gamma}, \code{poisson} and \code{tweedie}.
#'
#' Note that there are two ways to specify the tweedie family.
#' \itemize{
#' \item Set \code{family = "tweedie"} and specify the var.power and link.power;
#' \item When package \code{statmod} is loaded, the tweedie family is specified
#' using the family definition therein, i.e., \code{tweedie(var.power, link.power)}.
#' }
#' @param tol positive convergence tolerance of iterations.
#' @param maxIter integer giving the maximal number of IRLS iterations.
#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
#' weights as 1.0.
#' @param regParam regularization parameter for L2 regularization.
#' @param var.power the power in the variance function of the Tweedie distribution which provides
#' the relationship between the variance and mean of the distribution. Only
#' applicable to the Tweedie family.
#' @param link.power the index in the power link function. Only applicable to the Tweedie family.
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category
#' after ordering is dropped when encoding strings. Supported options
#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and
#' "alphabetAsc". The default value is "frequencyDesc". When the
#' ordering is set to "alphabetDesc", this drops the same category
#' as R when encoding strings.
#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance
#' offsets as 0.0. The feature specified as offset has a constant coefficient of
#' 1.0.
#' @param ... additional arguments passed to the method.
#' @aliases spark.glm,SparkDataFrame,formula-method
#' @return \code{spark.glm} returns a fitted generalized linear model.
#' @rdname spark.glm
#' @name spark.glm
#' @examples
#' \dontrun{
#' sparkR.session()
#' t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
#' df <- createDataFrame(t)
#' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
#' summary(model)
#'
#' # fitted values on training data
#' fitted <- predict(model, df)
#' head(select(fitted, "Freq", "prediction"))
#'
#' # save fitted model to input path
#' path <- "path/to/model"
#' write.ml(model, path)
#'
#' # can also read back the saved model and print
#' savedModel <- read.ml(path)
#' summary(savedModel)
#'
#' # note that the default string encoding is different from R's glm
#' model2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = t)
#' summary(model2)
#' # use stringIndexerOrderType = "alphabetDesc" to force string encoding
#' # to be consistent with R
#' model3 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
#' stringIndexerOrderType = "alphabetDesc")
#' summary(model3)
#'
#' # fit tweedie model
#' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
#' var.power = 1.2, link.power = 0)
#' summary(model)
#'
#' # use the tweedie family from statmod
#' library(statmod)
#' model <- spark.glm(df, Freq ~ Sex + Age, family = tweedie(1.2, 0))
#' summary(model)
#' }
#' @note spark.glm since 2.0.0
#' @seealso \link{glm}, \link{read.ml}
setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc"),
offsetCol = NULL) {
stringIndexerOrderType <- match.arg(stringIndexerOrderType)
if (is.character(family)) {
# Handle when family = "tweedie"
if (tolower(family) == "tweedie") {
family <- list(family = "tweedie", link = NULL)
} else {
family <- get(family, mode = "function", envir = parent.frame())
}
}
if (is.function(family)) {
family <- family()
}
if (is.null(family$family)) {
print(family)
stop("'family' not recognized")
}
# Handle when family = statmod::tweedie()
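            # statmod's tweedie family has variance function V(mu) = mu^var.power and power
            # link eta = mu^link.power, so evaluating each at mu = e and taking logs recovers
            # both powers from the family object (a log link gives linkfun(e) = 1, i.e. power 0).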
            if (tolower(family$family) == "tweedie" && !is.null(family$variance)) {
              var.power <- log(family$variance(exp(1)))
              link.power <- log(family$linkfun(exp(1)))
              family <- list(family = "tweedie", link = NULL)
            }
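            # deparse() may split a long formula over several lines; collapse it back into
            # a single string before handing it to the JVM wrapper.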
            formula <- paste(deparse(formula), collapse = "")
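            # An empty weight column name is treated like NULL: all instance weights are 1.0.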
            if (!is.null(weightCol) && weightCol == "") {
              weightCol <- NULL
            } else if (!is.null(weightCol)) {
              weightCol <- as.character(weightCol)
            }
            if (!is.null(offsetCol)) {
              offsetCol <- as.character(offsetCol)
              if (nchar(offsetCol) == 0) {
                offsetCol <- NULL
              }
            }
            # For known families, Gamma is upper-cased
            jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                "fit", formula, data@sdf, tolower(family$family), family$link,
                                tol, as.integer(maxIter), weightCol, regParam,
                                as.double(var.power), as.double(link.power),
                                stringIndexerOrderType, offsetCol)
            new("GeneralizedLinearRegressionModel", jobj = jobj)
          })

#' Generalized Linear Models (R-compliant)
#'
#' Fits a generalized linear model, similarly to R's glm().
#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', '.', ':', '+', and '-'.
#' @param data a SparkDataFrame or R's glm data for training.
#' @param family a description of the error distribution and link function to be used in the model.
#' This can be a character string naming a family function, a family function or
#' the result of a call to a family function. Refer to R's family documentation at
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
#' Currently these families are supported: \code{binomial}, \code{gaussian},
#' \code{poisson}, \code{Gamma}, and \code{tweedie}.
#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
#' weights as 1.0.
#' @param epsilon positive convergence tolerance of iterations.
#' @param maxit integer giving the maximal number of IRLS iterations.
#' @param var.power the index of the power variance function in the Tweedie family.
#' @param link.power the index of the power link function in the Tweedie family.
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category
#' after ordering is dropped when encoding strings. Supported options
#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and
#' "alphabetAsc". The default value is "frequencyDesc". When the
#' ordering is set to "alphabetDesc", this drops the same category
#' as R when encoding strings.
#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance
#' offsets as 0.0. The feature specified as offset has a constant coefficient of
#' 1.0.
#' @return \code{glm} returns a fitted generalized linear model.
#' @rdname glm
#' @aliases glm
#' @examples
#' \dontrun{
#' sparkR.session()
#' t <- as.data.frame(Titanic)
#' df <- createDataFrame(t)
#' model <- glm(Freq ~ Sex + Age, df, family = "gaussian")
#' summary(model)
#' }
#' @note glm since 1.5.0
#' @seealso \link{spark.glm}
setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
var.power = 0.0, link.power = 1.0 - var.power,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc"),
offsetCol = NULL) {
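            # glm() is a thin R-compatible wrapper: it maps R's epsilon/maxit argument
            # names onto spark.glm's tol/maxIter and delegates everything else unchanged.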
            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
                      var.power = var.power, link.power = link.power,
                      stringIndexerOrderType = stringIndexerOrderType,
                      offsetCol = offsetCol)
          })

# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
#' @param object a fitted generalized linear model.
#' @return \code{summary} returns summary information of the fitted model, which is a list.
#' The list of components includes at least the \code{coefficients} (coefficients matrix,
#' which includes coefficients, standard error of coefficients, t value and p value),
#' \code{null.deviance} (null deviance), \code{deviance} (residual deviance),
#' \code{aic} (AIC) and \code{iter} (number of IRLS iterations). If there are collinear
#' columns in the data, the coefficients matrix only provides coefficients.
#' @rdname spark.glm
#' @note summary(GeneralizedLinearRegressionModel) since 2.0.0
setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
function(object) {
jobj <- object@jobj
is.loaded <- callJMethod(jobj, "isLoaded")
features <- callJMethod(jobj, "rFeatures")
coefficients <- callJMethod(jobj, "rCoefficients")
dispersion <- callJMethod(jobj, "rDispersion")
null.deviance <- callJMethod(jobj, "rNullDeviance")
deviance <- callJMethod(jobj, "rDeviance")
df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
iter <- callJMethod(jobj, "rNumIterations")
family <- callJMethod(jobj, "rFamily")
aic <- callJMethod(jobj, "rAic")
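            # The wrapper reports an AIC of 0 when it is undefined for the Tweedie family;
            # surface that as NA rather than a misleading zero.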
if (family == "tweedie" && aic == 0) aic <- NA
deviance.resid <- if (is.loaded) {
NULL
} else {
dataFrame(callJMethod(jobj, "rDevianceResiduals"))
}
# If the underlying WeightedLeastSquares using "normal" solver, we can provide
# coefficients, standard error of coefficients, t value and p value. Otherwise,
# it will be fitted by local "l-bfgs", we can only provide coefficients.
if (length(features) == length(coefficients)) {
coefficients <- matrix(unlist(coefficients), ncol = 1)
colnames(coefficients) <- c("Estimate")
rownames(coefficients) <- unlist(features)
} else {
coefficients <- matrix(unlist(coefficients), ncol = 4)
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
rownames(coefficients) <- unlist(features)
}
ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
dispersion = dispersion, null.deviance = null.deviance,
deviance = deviance, df.null = df.null, df.residual = df.residual,
aic = aic, iter = iter, family = family, is.loaded = is.loaded)
class(ans) <- "summary.GeneralizedLinearRegressionModel"
ans
})
# Prints the summary of GeneralizedLinearRegressionModel
#' @rdname spark.glm
#' @param x summary object of a fitted generalized linear model returned by the \code{summary} function.
#' @note print.summary.GeneralizedLinearRegressionModel since 2.0.0
print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
  if (x$is.loaded) {
    cat("\nA saved/loaded model does not support the 'Deviance Residuals' output.\n")
  } else {
    x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals",
                                 c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)),
                                 c("Min", "1Q", "Median", "3Q", "Max"))
    x$deviance.resid <- zapsmall(x$deviance.resid, 5L)
    cat("\nDeviance Residuals: \n")
    cat("(Note: These are approximate quantiles with relative error <= 0.01)\n")
    print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L)
  }
  cat("\nCoefficients:\n")
  print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L)
  cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion),
      ")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"),
                           format(unlist(x[c("null.deviance", "deviance")]), digits = 5L),
                           " on", format(unlist(x[c("df.null", "df.residual")])),
                           " degrees of freedom\n"),
                     1L, paste, collapse = " "), sep = "")
  cat("AIC: ", format(x$aic, digits = 4L), "\n\n",
      "Number of Fisher Scoring iterations: ", x$iter, "\n\n", sep = "")
  invisible(x)
}

# Makes predictions from a generalized linear model produced by glm() or spark.glm(),
# similarly to R's predict().
#' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
#' "prediction".
#' @rdname spark.glm
#' @note predict(GeneralizedLinearRegressionModel) since 1.5.0
setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"),
function(object, newData) {
predict_internal(object, newData)
})
# Saves the generalized linear model to the input path.
#' @param path the directory where the model is saved.
#' @param overwrite whether to overwrite the model if the output path already exists. Default is
#' FALSE, which means an exception is thrown if the output path exists.
#'
#' @rdname spark.glm
#' @note write.ml(GeneralizedLinearRegressionModel, character) since 2.0.0
setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
})
#' Isotonic Regression Model
#'
#' Fits an Isotonic Regression model against a SparkDataFrame, similarly to R's isoreg().
#' Users can print the fitted model, make predictions with it, and save it to the input path.
#'
#' @param data SparkDataFrame for training.
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', '.', ':', '+', and '-'.
#' @param isotonic Whether the output sequence should be isotonic/increasing (TRUE) or
#' antitonic/decreasing (FALSE).
#' @param featureIndex The index of the feature if \code{featuresCol} is a vector column
#' (default: 0), no effect otherwise.
#' @param weightCol The weight column name.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.isoreg} returns a fitted Isotonic Regression model.
#' @rdname spark.isoreg
#' @aliases spark.isoreg,SparkDataFrame,formula-method
#' @name spark.isoreg
#' @examples
#' \dontrun{
#' sparkR.session()
#' data <- list(list(7.0, 0.0), list(5.0, 1.0), list(3.0, 2.0),
#' list(5.0, 3.0), list(1.0, 4.0))
#' df <- createDataFrame(data, c("label", "feature"))
#' model <- spark.isoreg(df, label ~ feature, isotonic = FALSE)
#' # return model boundaries and prediction as lists
#' result <- summary(model, df)
#' # prediction based on fitted model
#' predict_data <- list(list(-2.0), list(-1.0), list(0.5),
#' list(0.75), list(1.0), list(2.0), list(9.0))
#' predict_df <- createDataFrame(predict_data, c("feature"))
#' # get prediction column
#' predict_result <- collect(select(predict(model, predict_df), "prediction"))
#'
#' # save fitted model to input path
#' path <- "path/to/model"
#' write.ml(model, path)
#'
#' # can also read back the saved model and print
#' savedModel <- read.ml(path)
#' summary(savedModel)
#' }
#' @note spark.isoreg since 2.1.0
setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) {
formula <- paste(deparse(formula), collapse = "")
if (!is.null(weightCol) && weightCol == "") {
weightCol <- NULL
} else if (!is.null(weightCol)) {
weightCol <- as.character(weightCol)
}
jobj <- callJStatic("org.apache.spark.ml.r.IsotonicRegressionWrapper", "fit",
data@sdf, formula, as.logical(isotonic), as.integer(featureIndex),
weightCol)
new("IsotonicRegressionModel", jobj = jobj)
})
# Get the summary of an IsotonicRegressionModel
#' @return \code{summary} returns summary information of the fitted model, which is a list.
#' The list includes model's \code{boundaries} (boundaries in increasing order)
#' and \code{predictions} (predictions associated with the boundaries at the same index).
#' @rdname spark.isoreg
#' @aliases summary,IsotonicRegressionModel-method
#' @note summary(IsotonicRegressionModel) since 2.1.0
setMethod("summary", signature(object = "IsotonicRegressionModel"),
function(object) {
jobj <- object@jobj
boundaries <- callJMethod(jobj, "boundaries")
predictions <- callJMethod(jobj, "predictions")
list(boundaries = boundaries, predictions = predictions)
})
# Predicted values based on an IsotonicRegressionModel
#' @param object a fitted IsotonicRegressionModel.
#' @param newData SparkDataFrame for testing.
#' @return \code{predict} returns a SparkDataFrame containing predicted values.
#' @rdname spark.isoreg
#' @aliases predict,IsotonicRegressionModel,SparkDataFrame-method
#' @note predict(IsotonicRegressionModel) since 2.1.0
setMethod("predict", signature(object = "IsotonicRegressionModel"),
function(object, newData) {
predict_internal(object, newData)
})
# Save fitted IsotonicRegressionModel to the input path
#' @param path The directory where the model is saved.
#' @param overwrite Whether to overwrite the model if the output path already exists. Default is
#' FALSE, which means an exception is thrown if the output path exists.
#'
#' @rdname spark.isoreg
#' @aliases write.ml,IsotonicRegressionModel,character-method
#' @note write.ml(IsotonicRegressionModel, character) since 2.1.0
setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
})
#' Accelerated Failure Time (AFT) Survival Regression Model
#'
#' \code{spark.survreg} fits an accelerated failure time (AFT) survival regression model on
#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted AFT model,
#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to
#' save/load fitted models.
#'
#' @param data a SparkDataFrame for training.
#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', ':', '+', and '-'.
#' Note that operator '.' is not supported currently.
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the
#' dimensions of features or the number of partitions are large, this
#' param could be adjusted to a larger size. This is an expert parameter.
#' Default value should be good for most cases.
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category
#' after ordering is dropped when encoding strings. Supported options
#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and
#' "alphabetAsc". The default value is "frequencyDesc". When the
#' ordering is set to "alphabetDesc", this drops the same category
#' as R when encoding strings.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.survreg} returns a fitted AFT survival regression model.
#' @rdname spark.survreg
#' @seealso survival: \url{https://cran.r-project.org/package=survival}
#' @examples
#' \dontrun{
#' df <- createDataFrame(ovarian)
#' model <- spark.survreg(df, Surv(futime, fustat) ~ ecog_ps + rx)
#'
#' # get a summary of the model
#' summary(model)
#'
#' # make predictions
#' predicted <- predict(model, df)
#' showDF(predicted)
#'
#' # save and load the model
#' path <- "path/to/model"
#' write.ml(model, path)
#' savedModel <- read.ml(path)
#' summary(savedModel)
#' }
#' @note spark.survreg since 2.0.0
setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, aggregationDepth = 2,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc")) {
stringIndexerOrderType <- match.arg(stringIndexerOrderType)
formula <- paste(deparse(formula), collapse = "")
jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
"fit", formula, data@sdf, as.integer(aggregationDepth),
stringIndexerOrderType)
new("AFTSurvivalRegressionModel", jobj = jobj)
})
# Returns a summary of the AFT survival regression model produced by spark.survreg,
# similarly to R's summary().
#' @param object a fitted AFT survival regression model.
#' @return \code{summary} returns summary information of the fitted model, which is a list.
#' The list includes the model's \code{coefficients} (features, coefficients,
#' intercept and log(scale)).
#' @rdname spark.survreg
#' @note summary(AFTSurvivalRegressionModel) since 2.0.0
setMethod("summary", signature(object = "AFTSurvivalRegressionModel"),
function(object) {
jobj <- object@jobj
features <- callJMethod(jobj, "rFeatures")
coefficients <- callJMethod(jobj, "rCoefficients")
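            # Besides the feature coefficients, the returned values include the intercept
            # and Log(scale) (see @return above); rFeatures supplies the matching row names.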
            coefficients <- as.matrix(unlist(coefficients))
            colnames(coefficients) <- c("Value")
            rownames(coefficients) <- unlist(features)
            list(coefficients = coefficients)
          })

# Makes predictions from an AFT survival regression model produced by spark.survreg,
# similarly to the predict method of R's survival package.
#' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns a SparkDataFrame containing predicted values
#' on the original scale of the data (mean predicted value at scale = 1.0).
#' @rdname spark.survreg
#' @note predict(AFTSurvivalRegressionModel) since 2.0.0
setMethod("predict", signature(object = "AFTSurvivalRegressionModel"),
function(object, newData) {
predict_internal(object, newData)
})
# Saves the AFT survival regression model to the input path.
#' @param path the directory where the model is saved.
#' @param overwrite whether to overwrite the model if the output path already exists. Default is
#' FALSE, which means an exception is thrown if the output path exists.
#' @rdname spark.survreg
#' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0
#' @seealso \link{write.ml}
setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
})
#' Linear Regression Model
#'
#' \code{spark.lm} fits a linear regression model against a SparkDataFrame.
#' Users can call \code{summary} to print a summary of the fitted model,
#' \code{predict} to make predictions on new data,
#' and \code{write.ml}/\code{read.ml} to save/load fitted models.
#'
#' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', '.', ':', '+', and '-'.
#' @param maxIter maximum iteration number.
#' @param regParam the regularization parameter.
#' @param elasticNetParam the ElasticNet mixing parameter, in range [0, 1].
#' For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
#' @param tol convergence tolerance of iterations.
#' @param standardization whether to standardize the training features before fitting the model.
#' @param weightCol weight column name.
#' @param aggregationDepth suggested depth for treeAggregate (>= 2).
#' @param loss the loss function to be optimized. Supported options: "squaredError" and "huber".
#' @param epsilon the shape parameter to control the amount of robustness.
#' @param solver The solver algorithm for optimization.
#' Supported options: "l-bfgs", "normal" and "auto".
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category
#' after ordering is dropped when encoding strings. Supported options
#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and
#' "alphabetAsc". The default value is "frequencyDesc". When the
#' ordering is set to "alphabetDesc", this drops the same category
#' as R when encoding strings.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.lm} returns a fitted Linear Regression Model.
#' @rdname spark.lm
#' @aliases spark.lm,SparkDataFrame,formula-method
#' @name spark.lm
#' @seealso \link{read.ml}
#' @examples
#' \dontrun{
#' df <- read.df("data/mllib/sample_linear_regression_data.txt", source = "libsvm")
#'
#' # fit Linear Regression Model
#' model <- spark.lm(df, label ~ features, regParam = 0.01, maxIter = 1)
#'
#' # get the summary of the model
#' summary(model)
#'
#' # make predictions
#' predictions <- predict(model, df)
#'
#' # save and load the model
#' path <- "path/to/model"
#' write.ml(model, path)
#' savedModel <- read.ml(path)
#' summary(savedModel)
#' }
#' @note spark.lm since 3.1.0
setMethod("spark.lm", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula,
maxIter = 100L, regParam = 0.0, elasticNetParam = 0.0,
tol = 1e-6, standardization = TRUE,
solver = c("auto", "l-bfgs", "normal"),
weightCol = NULL, aggregationDepth = 2L,
loss = c("squaredError", "huber"), epsilon = 1.35,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc")) {
formula <- paste(deparse(formula), collapse = "")
solver <- match.arg(solver)
loss <- match.arg(loss)
stringIndexerOrderType <- match.arg(stringIndexerOrderType)
if (!is.null(weightCol) && weightCol == "") {
weightCol <- NULL
} else if (!is.null(weightCol)) {
weightCol <- as.character(weightCol)
}
jobj <- callJStatic("org.apache.spark.ml.r.LinearRegressionWrapper",
"fit",
data@sdf,
formula,
as.integer(maxIter),
as.numeric(regParam),
as.numeric(elasticNetParam),
as.numeric(tol),
as.logical(standardization),
solver,
weightCol,
as.integer(aggregationDepth),
loss,
as.numeric(epsilon),
stringIndexerOrderType)
new("LinearRegressionModel", jobj = jobj)
})
# Returns the summary of a Linear Regression model produced by \code{spark.lm}
#' @param object a Linear Regression Model fitted by \code{spark.lm}.
#' @return \code{summary} returns summary information of the fitted model, which is a list.
#'
#' @rdname spark.lm
#' @note summary(LinearRegressionModel) since 3.1.0
setMethod("summary", signature(object = "LinearRegressionModel"),
function(object) {
jobj <- object@jobj
features <- callJMethod(jobj, "rFeatures")
coefficients <- callJMethod(jobj, "rCoefficients")
coefficients <- as.matrix(unlist(coefficients))
colnames(coefficients) <- c("Estimate")
rownames(coefficients) <- unlist(features)
numFeatures <- callJMethod(jobj, "numFeatures")
list(
coefficients = coefficients,
numFeatures = numFeatures
)
})
# Predicted values based on a LinearRegressionModel
#' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns the predicted values based on a LinearRegressionModel.
#'
#' @rdname spark.lm
#' @aliases predict,LinearRegressionModel,SparkDataFrame-method
#' @note predict(LinearRegressionModel) since 3.1.0
setMethod("predict", signature(object = "LinearRegressionModel"),
function(object, newData) {
predict_internal(object, newData)
})
# Save fitted LinearRegressionModel to the input path
#' @param path The directory where the model is saved.
#' @param overwrite Whether to overwrite the model if the output path already exists. Default is
#' FALSE, which means an exception is thrown if the output path exists.
#'
#' @rdname spark.lm
#' @aliases write.ml,LinearRegressionModel,character-method
#' @note write.ml(LinearRegressionModel, character) since 3.1.0
setMethod("write.ml", signature(object = "LinearRegressionModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
})
#' Factorization Machines Regression Model
#'
#' \code{spark.fmRegressor} fits a factorization machines regression model against a SparkDataFrame.
#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
#'
#' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', '.', ':', '+', and '-'.
#' @param factorSize dimensionality of the factors.
#' @param fitLinear whether to fit the linear term. # TODO Can we express this with formula?
#' @param regParam the regularization parameter.
#' @param miniBatchFraction the mini-batch fraction parameter.
#' @param initStd the standard deviation of initial coefficients.
#' @param maxIter maximum iteration number.
#' @param stepSize stepSize parameter.
#' @param tol convergence tolerance of iterations.
#' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "adamW".
#' @param seed seed parameter for weights initialization.
#' @param stringIndexerOrderType how to order categories of a string feature column. This is used to
#' decide the base level of a string feature as the last category
#' after ordering is dropped when encoding strings. Supported options
#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and
#' "alphabetAsc". The default value is "frequencyDesc". When the
#' ordering is set to "alphabetDesc", this drops the same category
#' as R when encoding strings.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.fmRegressor} returns a fitted Factorization Machines Regression Model.
#'
#' @rdname spark.fmRegressor
#' @aliases spark.fmRegressor,SparkDataFrame,formula-method
#' @name spark.fmRegressor
#' @seealso \link{read.ml}
#' @examples
#' \dontrun{
#' df <- read.df("data/mllib/sample_linear_regression_data.txt", source = "libsvm")
#'
#' # fit Factorization Machines Regression Model
#' model <- spark.fmRegressor(
#' df, label ~ features,
#' regParam = 0.01, maxIter = 10, fitLinear = TRUE
#' )
#'
#' # get the summary of the model
#' summary(model)
#'
#' # make predictions
#' predictions <- predict(model, df)
#'
#' # save and load the model
#' path <- "path/to/model"
#' write.ml(model, path)
#' savedModel <- read.ml(path)
#' summary(savedModel)
#' }
#' @note spark.fmRegressor since 3.1.0
setMethod("spark.fmRegressor", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, factorSize = 8, fitLinear = TRUE, regParam = 0.0,
miniBatchFraction = 1.0, initStd = 0.01, maxIter = 100, stepSize=1.0,
tol = 1e-6, solver = c("adamW", "gd"), seed = NULL,
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
"alphabetDesc", "alphabetAsc")) {
formula <- paste(deparse(formula), collapse = "")
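            # The backend wrapper takes the seed as a string (or NULL when unset), hence
            # the integer-to-character round trip below.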
            if (!is.null(seed)) {
              seed <- as.character(as.integer(seed))
            }
            solver <- match.arg(solver)
            stringIndexerOrderType <- match.arg(stringIndexerOrderType)
            jobj <- callJStatic("org.apache.spark.ml.r.FMRegressorWrapper",
                                "fit",
                                data@sdf,
                                formula,
                                as.integer(factorSize),
                                as.logical(fitLinear),
                                as.numeric(regParam),
                                as.numeric(miniBatchFraction),
                                as.numeric(initStd),
                                as.integer(maxIter),
                                as.numeric(stepSize),
                                as.numeric(tol),
                                solver,
                                seed,
                                stringIndexerOrderType)
            new("FMRegressionModel", jobj = jobj)
          })

# Returns the summary of an FM Regression model produced by \code{spark.fmRegressor}
#' @param object an FM Regression Model fitted by \code{spark.fmRegressor}.
#' @return \code{summary} returns summary information of the fitted model, which is a list.
#'
#' @rdname spark.fmRegressor
#' @note summary(FMRegressionModel) since 3.1.0
setMethod("summary", signature(object = "FMRegressionModel"),
function(object) {
jobj <- object@jobj
features <- callJMethod(jobj, "rFeatures")
coefficients <- callJMethod(jobj, "rCoefficients")
coefficients <- as.matrix(unlist(coefficients))
colnames(coefficients) <- c("Estimate")
rownames(coefficients) <- unlist(features)
numFeatures <- callJMethod(jobj, "numFeatures")
raw_factors <- unlist(callJMethod(jobj, "rFactors"))
factor_size <- callJMethod(jobj, "factorSize")
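            # rFactors returns the pairwise-interaction factor matrix flattened;
            # rebuild it with factorSize columns before returning it to the user.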
            list(
              coefficients = coefficients,
              factors = matrix(raw_factors, ncol = factor_size),
              numFeatures = numFeatures,
              factorSize = factor_size
            )
          })

# Predicted values based on an FMRegressionModel
#' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns the predicted values based on an FMRegressionModel.
#'
#' @rdname spark.fmRegressor
#' @aliases predict,FMRegressionModel,SparkDataFrame-method
#' @note predict(FMRegressionModel) since 3.1.0
setMethod("predict", signature(object = "FMRegressionModel"),
function(object, newData) {
predict_internal(object, newData)
})
# Save fitted FMRegressionModel to the input path
#' @param path The directory where the model is saved.
#' @param overwrite Whether to overwrite the model if the output path already exists. Default is
#' FALSE, which means an exception is thrown if the output path exists.
#'
#' @rdname spark.fmRegressor
#' @aliases write.ml,FMRegressionModel,character-method
#' @note write.ml(FMRegressionModel, character) since 3.1.0
setMethod("write.ml", signature(object = "FMRegressionModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
})