| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| # mllib_classification.R: Provides methods for MLlib classification algorithms |
| # (except for tree-based algorithms) integration |
| |
| #' S4 class that represents a LinearSVCModel |
| #' |
| #' @param jobj a Java object reference to the backing Scala LinearSVCModel |
| #' @note LinearSVCModel since 2.2.0 |
setClass("LinearSVCModel",
         # Single slot: the Java object reference to the backing Scala model.
         slots = c(jobj = "jobj"))
| |
| #' S4 class that represents a LogisticRegressionModel |
| #' |
| #' @param jobj a Java object reference to the backing Scala LogisticRegressionModel |
| #' @note LogisticRegressionModel since 2.1.0 |
setClass("LogisticRegressionModel",
         # Single slot: the Java object reference to the backing Scala model.
         slots = c(jobj = "jobj"))
| |
| #' S4 class that represents a MultilayerPerceptronClassificationModel |
| #' |
| #' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper |
| #' @note MultilayerPerceptronClassificationModel since 2.1.0 |
setClass("MultilayerPerceptronClassificationModel",
         # Single slot: the Java object reference to the backing Scala wrapper.
         slots = c(jobj = "jobj"))
| |
| #' S4 class that represents a NaiveBayesModel |
| #' |
| #' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper |
| #' @note NaiveBayesModel since 2.0.0 |
setClass("NaiveBayesModel",
         # Single slot: the Java object reference to the backing Scala wrapper.
         slots = c(jobj = "jobj"))
| |
| #' S4 class that represents a FMClassificationModel |
| #' |
| #' @param jobj a Java object reference to the backing Scala FMClassifierWrapper |
| #' @note FMClassificationModel since 3.1.0 |
setClass("FMClassificationModel",
         # Single slot: the Java object reference to the backing Scala wrapper.
         slots = c(jobj = "jobj"))
| |
| #' Linear SVM Model |
| #' |
| #' Fits a linear SVM model against a SparkDataFrame, similar to svm in e1071 package. |
| #' Currently only supports binary classification model with linear kernel. |
| #' Users can print, make predictions on the produced model and save the model to the input path. |
| #' |
| #' @param data SparkDataFrame for training. |
| #' @param formula A symbolic description of the model to be fitted. Currently only a few formula |
| #' operators are supported, including '~', '.', ':', '+', '-', '*', and '^'. |
| #' @param regParam The regularization parameter. Only supports L2 regularization currently. |
| #' @param maxIter Maximum iteration number. |
| #' @param tol Convergence tolerance of iterations. |
| #' @param standardization Whether to standardize the training features before fitting the model. |
| #' The coefficients of models will be always returned on the original scale, |
| #' so it will be transparent for users. Note that with/without |
| #' standardization, the models should be always converged to the same |
| #' solution when no regularization is applied. |
| #' @param threshold The threshold in binary classification applied to the linear model prediction. |
| #' This threshold can be any real number, where Inf will make all predictions 0.0 |
| #' and -Inf will make all predictions 1.0. |
| #' @param weightCol The weight column name. |
| #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the |
| #' dimensions of features or the number of partitions are large, this param |
| #' could be adjusted to a larger size. |
| #' This is an expert parameter. Default value should be good for most cases. |
| #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and |
| #' label column of string type. |
| #' Supported options: "skip" (filter out rows with invalid data), |
| #' "error" (throw an error), "keep" (put invalid data in |
| #' a special additional bucket, at index numLabels). Default |
| #' is "error". |
| #' @param ... additional arguments passed to the method. |
| #' @return \code{spark.svmLinear} returns a fitted linear SVM model. |
| #' @rdname spark.svmLinear |
| #' @aliases spark.svmLinear,SparkDataFrame,formula-method |
| #' @name spark.svmLinear |
| #' @examples |
| #' \dontrun{ |
| #' sparkR.session() |
| #' t <- as.data.frame(Titanic) |
| #' training <- createDataFrame(t) |
| #' model <- spark.svmLinear(training, Survived ~ ., regParam = 0.5) |
| #' summary <- summary(model) |
| #' |
| #' # fitted values on training data |
| #' fitted <- predict(model, training) |
| #' |
| #' # save fitted model to input path |
| #' path <- "path/to/model" |
| #' write.ml(model, path) |
| #' |
| #' # can also read back the saved model and predict |
| #' # Note that summary does not work on loaded model |
| #' savedModel <- read.ml(path) |
| #' summary(savedModel) |
| #' } |
| #' @note spark.svmLinear since 2.2.0 |
setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, regParam = 0.0, maxIter = 100, tol = 1E-6,
                   standardization = TRUE, threshold = 0.0, weightCol = NULL,
                   aggregationDepth = 2, handleInvalid = c("error", "keep", "skip")) {
            # deparse() may return several pieces; collapse them into one formula string.
            formula <- paste(deparse(formula), collapse = "")

            # An empty weight column name is treated the same as no weight column;
            # any other value is passed on as a character string.
            hasWeightCol <- !is.null(weightCol)
            if (hasWeightCol && weightCol == "") {
              weightCol <- NULL
            } else if (hasWeightCol) {
              weightCol <- as.character(weightCol)
            }

            # Reduce the choice vector to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            # Coerce every argument to its expected R type and fit through the Scala wrapper.
            fittedModel <- callJStatic(
              "org.apache.spark.ml.r.LinearSVCWrapper", "fit",
              data@sdf,
              formula,
              as.numeric(regParam),
              as.integer(maxIter),
              as.numeric(tol),
              as.logical(standardization),
              as.numeric(threshold),
              weightCol,
              as.integer(aggregationDepth),
              handleInvalid
            )
            new("LinearSVCModel", jobj = fittedModel)
          })
| |
| # Predicted values based on a LinearSVCModel model |
| |
| #' @param newData a SparkDataFrame for testing. |
| #' @return \code{predict} returns the predicted values based on a LinearSVCModel. |
| #' @rdname spark.svmLinear |
| #' @aliases predict,LinearSVCModel,SparkDataFrame-method |
| #' @note predict(LinearSVCModel) since 2.2.0 |
setMethod("predict", signature(object = "LinearSVCModel"),
          # Thin wrapper: both arguments are handed unchanged to predict_internal.
          function(object, newData) predict_internal(object, newData))
| |
| # Get the summary of a LinearSVCModel |
| |
| #' @param object a LinearSVCModel fitted by \code{spark.svmLinear}. |
| #' @return \code{summary} returns summary information of the fitted model, which is a list. |
| #' The list includes \code{coefficients} (coefficients of the fitted model), |
| #' \code{numClasses} (number of classes), \code{numFeatures} (number of features). |
| #' @rdname spark.svmLinear |
| #' @aliases summary,LinearSVCModel-method |
| #' @note summary(LinearSVCModel) since 2.2.0 |
setMethod("summary", signature(object = "LinearSVCModel"),
          function(object) {
            model <- object@jobj
            featureNames <- callJMethod(model, "rFeatures")

            # One-column matrix of estimates, one row per feature name.
            coefMatrix <- as.matrix(unlist(callJMethod(model, "rCoefficients")))
            colnames(coefMatrix) <- c("Estimate")
            rownames(coefMatrix) <- unlist(featureNames)

            # list() evaluates its arguments left to right, so numClasses is fetched
            # before numFeatures.
            list(coefficients = coefMatrix,
                 numClasses = callJMethod(model, "numClasses"),
                 numFeatures = callJMethod(model, "numFeatures"))
          })
| |
| # Save fitted LinearSVCModel to the input path |
| |
| #' @param path The directory where the model is saved. |
| #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE |
| #' which means throw exception if the output path exists. |
| #' |
| #' @rdname spark.svmLinear |
| #' @aliases write.ml,LinearSVCModel,character-method |
| #' @note write.ml(LinearSVCModel, character) since 2.2.0 |
setMethod("write.ml", signature(object = "LinearSVCModel", path = "character"),
          # Saving is delegated to write_internal; overwrite is forwarded as given.
          function(object, path, overwrite = FALSE) write_internal(object, path, overwrite))
| |
| #' Logistic Regression Model |
| #' |
| #' Fits a logistic regression model against a SparkDataFrame. It supports "binomial": Binary |
| #' logistic regression with pivoting; "multinomial": Multinomial logistic (softmax) regression |
| #' without pivoting, similar to glmnet. Users can print, make predictions on the produced model |
| #' and save the model to the input path. |
| #' |
| #' @param data SparkDataFrame for training. |
| #' @param formula A symbolic description of the model to be fitted. Currently only a few formula |
| #' operators are supported, including '~', '.', ':', '+', and '-'. |
| #' @param regParam the regularization parameter. |
| #' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 |
| #' penalty. For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, |
| #' the penalty is a combination of L1 and L2. Default is 0.0 which is an |
| #' L2 penalty. |
| #' @param maxIter maximum iteration number. |
| #' @param tol convergence tolerance of iterations. |
| #' @param family the name of family which is a description of the label distribution to be used |
| #' in the model. |
| #' Supported options: |
| #' \itemize{ |
| #' \item{"auto": Automatically select the family based on the number of classes: |
| #' If number of classes == 1 || number of classes == 2, set to "binomial". |
| #' Else, set to "multinomial".} |
| #' \item{"binomial": Binary logistic regression with pivoting.} |
| #' \item{"multinomial": Multinomial logistic (softmax) regression without |
| #' pivoting.} |
| #' } |
| #' @param standardization whether to standardize the training features before fitting the model. |
| #' The coefficients of models will be always returned on the original scale, |
| #' so it will be transparent for users. Note that with/without |
| #' standardization, the models should be always converged to the same |
| #' solution when no regularization is applied. Default is TRUE, same as |
| #' glmnet. |
| #' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of |
| #' class label 1 is > threshold, then predict 1, else 0. A high threshold |
| #' encourages the model to predict 0 more often; a low threshold encourages the |
| #' model to predict 1 more often. Note: Setting this with threshold p is |
| #' equivalent to setting thresholds c(1-p, p). In multiclass (or binary) |
| #' classification to adjust the probability of predicting each class. Array must |
| #' have length equal to the number of classes, with values > 0, excepting that |
| #' at most one value may be 0. The class with largest value p/t is predicted, |
| #' where p is the original probability of that class and t is the class's |
| #' threshold. |
| #' @param weightCol The weight column name. |
| #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the |
| #' dimensions of features or the number of partitions are large, this param |
| #' could be adjusted to a larger size. This is an expert parameter. Default |
| #' value should be good for most cases. |
| #' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound |
| #' constrained optimization. |
| #' The bound matrix must be compatible with the shape (1, number |
| #' of features) for binomial regression, or (number of classes, |
| #' number of features) for multinomial regression. |
| #' It is a R matrix. |
| #' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound |
| #' constrained optimization. |
| #' The bound matrix must be compatible with the shape (1, number |
| #' of features) for binomial regression, or (number of classes, |
| #' number of features) for multinomial regression. |
| #' It is a R matrix. |
| #' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained |
| #' optimization. |
| #' The bounds vector size must be equal to 1 for binomial regression, |
| #' or the number |
| #' of classes for multinomial regression. |
| #' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained |
| #' optimization. |
| #' The bound vector size must be equal to 1 for binomial regression, |
| #' or the number of classes for multinomial regression. |
| #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and |
| #' label column of string type. |
| #' Supported options: "skip" (filter out rows with invalid data), |
| #' "error" (throw an error), "keep" (put invalid data in |
| #' a special additional bucket, at index numLabels). Default |
| #' is "error". |
| #' @param ... additional arguments passed to the method. |
| #' @return \code{spark.logit} returns a fitted logistic regression model. |
| #' @rdname spark.logit |
| #' @aliases spark.logit,SparkDataFrame,formula-method |
| #' @name spark.logit |
| #' @examples |
| #' \dontrun{ |
| #' sparkR.session() |
| #' # binary logistic regression |
| #' t <- as.data.frame(Titanic) |
| #' training <- createDataFrame(t) |
| #' model <- spark.logit(training, Survived ~ ., regParam = 0.5) |
| #' summary <- summary(model) |
| #' |
| #' # fitted values on training data |
| #' fitted <- predict(model, training) |
| #' |
| #' # save fitted model to input path |
| #' path <- "path/to/model" |
| #' write.ml(model, path) |
| #' |
| #' # can also read back the saved model and predict |
| #' # Note that summary does not work on loaded model |
| #' savedModel <- read.ml(path) |
| #' summary(savedModel) |
| #' |
| #' # binary logistic regression against two classes with |
| #' # upperBoundsOnCoefficients and upperBoundsOnIntercepts |
| #' ubc <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4) |
| #' model <- spark.logit(training, Species ~ ., |
| #' upperBoundsOnCoefficients = ubc, |
| #' upperBoundsOnIntercepts = 1.0) |
| #' |
| #' # multinomial logistic regression |
| #' model <- spark.logit(training, Class ~ ., regParam = 0.5) |
| #' summary <- summary(model) |
| #' |
| #' # multinomial logistic regression with |
| #' # lowerBoundsOnCoefficients and lowerBoundsOnIntercepts |
| #' lbc <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4) |
| #' lbi <- as.array(c(0.0, 0.0)) |
| #' model <- spark.logit(training, Species ~ ., family = "multinomial", |
| #' lowerBoundsOnCoefficients = lbc, |
| #' lowerBoundsOnIntercepts = lbi) |
| #' } |
| #' @note spark.logit since 2.1.0 |
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                   tol = 1E-6, family = "auto", standardization = TRUE,
                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                   lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL,
                   handleInvalid = c("error", "keep", "skip")) {
            # deparse() may return several pieces; collapse them into one formula string.
            formula <- paste(deparse(formula), collapse = "")
            # Dimensions of the coefficient-bound matrices; both stay 0 when no bounds are given.
            row <- 0
            col <- 0

            # An empty weight column name is treated the same as no weight column.
            if (!is.null(weightCol) && weightCol == "") {
              weightCol <- NULL
            } else if (!is.null(weightCol)) {
              weightCol <- as.character(weightCol)
            }

            if (!is.null(lowerBoundsOnIntercepts)) {
              lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
            }

            if (!is.null(upperBoundsOnIntercepts)) {
              upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
            }

            if (!is.null(lowerBoundsOnCoefficients)) {
              # is.matrix() rather than class(x) != "matrix": since R 4.0.0 a matrix has
              # class c("matrix", "array"), which turns the comparison into a length-2
              # condition (a warning on R 4.0/4.1, an error from R 4.2 on).
              if (!is.matrix(lowerBoundsOnCoefficients)) {
                stop("lowerBoundsOnCoefficients must be a matrix.")
              }
              row <- nrow(lowerBoundsOnCoefficients)
              col <- ncol(lowerBoundsOnCoefficients)
              # Flatten (column-major) to a 1-d array; the shape travels separately as row/col.
              lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
            }

            if (!is.null(upperBoundsOnCoefficients)) {
              if (!is.matrix(upperBoundsOnCoefficients)) {
                stop("upperBoundsOnCoefficients must be a matrix.")
              }

              # When both bounds are supplied they must have identical shapes, because a
              # single (row, col) pair is sent for both.
              if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
                  || col != ncol(upperBoundsOnCoefficients))) {
                stop("dimension of upperBoundsOnCoefficients ",
                     "is not the same as lowerBoundsOnCoefficients")
              }

              if (is.null(lowerBoundsOnCoefficients)) {
                row <- nrow(upperBoundsOnCoefficients)
                col <- ncol(upperBoundsOnCoefficients)
              }

              upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
            }

            # Reduce the choice vector to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                data@sdf, formula, as.numeric(regParam),
                                as.numeric(elasticNetParam), as.integer(maxIter),
                                as.numeric(tol), as.character(family),
                                as.logical(standardization), as.array(thresholds),
                                weightCol, as.integer(aggregationDepth),
                                as.integer(row), as.integer(col),
                                lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts,
                                handleInvalid)
            new("LogisticRegressionModel", jobj = jobj)
          })
| |
| # Get the summary of a LogisticRegressionModel |
| |
| #' @param object a LogisticRegressionModel fitted by \code{spark.logit}. |
| #' @return \code{summary} returns summary information of the fitted model, which is a list. |
| #' The list includes \code{coefficients} (coefficients matrix of the fitted model). |
| #' @rdname spark.logit |
| #' @aliases summary,LogisticRegressionModel-method |
| #' @note summary(LogisticRegressionModel) since 2.1.0 |
setMethod("summary", signature(object = "LogisticRegressionModel"),
          function(object) {
            model <- object@jobj
            featureNames <- callJMethod(model, "rFeatures")
            classLabels <- callJMethod(model, "labels")
            rawCoefficients <- callJMethod(model, "rCoefficients")

            # One row per feature; the column count falls out of the flat coefficient length.
            numCols <- length(rawCoefficients) / length(featureNames)
            coefMatrix <- matrix(unlist(rawCoefficients), ncol = numCols)

            # A single column means a binomial model with pivoting; several columns mean a
            # multinomial model without pivoting, with one column per label.
            colnames(coefMatrix) <- if (numCols == 1) c("Estimate") else unlist(classLabels)
            rownames(coefMatrix) <- unlist(featureNames)

            list(coefficients = coefMatrix)
          })
| |
| # Predicted values based on a LogisticRegressionModel model |
| |
| #' @param newData a SparkDataFrame for testing. |
| #' @return \code{predict} returns the predicted values based on a LogisticRegressionModel. |
| #' @rdname spark.logit |
| #' @aliases predict,LogisticRegressionModel,SparkDataFrame-method |
| #' @note predict(LogisticRegressionModel) since 2.1.0 |
setMethod("predict", signature(object = "LogisticRegressionModel"),
          # Thin wrapper: both arguments are handed unchanged to predict_internal.
          function(object, newData) predict_internal(object, newData))
| |
| # Save fitted LogisticRegressionModel to the input path |
| |
| #' @param path The directory where the model is saved. |
| #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE |
| #' which means throw exception if the output path exists. |
| #' |
| #' @rdname spark.logit |
| #' @aliases write.ml,LogisticRegressionModel,character-method |
| #' @note write.ml(LogisticRegressionModel, character) since 2.1.0 |
setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "character"),
          # Saving is delegated to write_internal; overwrite is forwarded as given.
          function(object, path, overwrite = FALSE) write_internal(object, path, overwrite))
| |
| #' Multilayer Perceptron Classification Model |
| #' |
| #' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame. |
| #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make |
| #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. |
| #' Only categorical data is supported. |
| #' For more details, see |
| #' \href{https://spark.apache.org/docs/latest/ml-classification-regression.html}{ |
| #' Multilayer Perceptron} |
| #' |
| #' @param data a \code{SparkDataFrame} of observations and labels for model fitting. |
| #' @param formula a symbolic description of the model to be fitted. Currently only a few formula |
| #' operators are supported, including '~', '.', ':', '+', and '-'. |
| #' @param blockSize blockSize parameter. |
| #' @param layers integer vector containing the number of nodes for each layer. |
| #' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs". |
| #' @param maxIter maximum iteration number. |
| #' @param tol convergence tolerance of iterations. |
| #' @param stepSize stepSize parameter. |
| #' @param seed seed parameter for weights initialization. |
| #' @param initialWeights initialWeights parameter for weights initialization, it should be a |
| #' numeric vector. |
| #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and |
| #' label column of string type. |
| #' Supported options: "skip" (filter out rows with invalid data), |
| #' "error" (throw an error), "keep" (put invalid data in |
| #' a special additional bucket, at index numLabels). Default |
| #' is "error". |
| #' @param ... additional arguments passed to the method. |
| #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model. |
| #' @rdname spark.mlp |
| #' @aliases spark.mlp,SparkDataFrame,formula-method |
| #' @name spark.mlp |
| #' @seealso \link{read.ml} |
| #' @examples |
| #' \dontrun{ |
| #' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") |
| #' |
| #' # fit a Multilayer Perceptron Classification Model |
| #' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs", |
| #' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, |
| #' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) |
| #' |
| #' # get the summary of the model |
| #' summary(model) |
| #' |
| #' # make predictions |
| #' predictions <- predict(model, df) |
| #' |
| #' # save and load the model |
| #' path <- "path/to/model" |
| #' write.ml(model, path) |
| #' savedModel <- read.ml(path) |
| #' summary(savedModel) |
| #' } |
| #' @note spark.mlp since 2.1.0 |
setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL,
                   handleInvalid = c("error", "keep", "skip")) {
            # deparse() may return several pieces; collapse them into one formula string.
            formula <- paste(deparse(formula), collapse = "")

            # The same message is raised for a NULL and for a too-short layers vector.
            layersMessage <- "layers must be a integer vector with length > 1."
            if (is.null(layers)) {
              stop(layersMessage)
            }
            # NA entries are dropped before the length check.
            layers <- as.integer(na.omit(layers))
            if (length(layers) < 2) {
              stop(layersMessage)
            }

            # The seed is passed as the character form of its integer value.
            if (!is.null(seed)) {
              seed <- as.character(as.integer(seed))
            }
            # Initial weights are sent as a numeric array with NA entries removed.
            if (!is.null(initialWeights)) {
              initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
            }

            # Reduce the choice vector to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            fittedModel <- callJStatic(
              "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit",
              data@sdf,
              formula,
              as.integer(blockSize),
              as.array(layers),
              as.character(solver),
              as.integer(maxIter),
              as.numeric(tol),
              as.numeric(stepSize),
              seed,
              initialWeights,
              handleInvalid
            )
            new("MultilayerPerceptronClassificationModel", jobj = fittedModel)
          })
| |
| # Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp} |
| |
| #' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp} |
| #' @return \code{summary} returns summary information of the fitted model, which is a list. |
| #' The list includes \code{numOfInputs} (number of inputs), \code{numOfOutputs} |
| #' (number of outputs), \code{layers} (array of layer sizes including input |
| #' and output layers), and \code{weights} (the weights of layers). |
| #' For \code{weights}, it is a numeric vector whose length equals the number of connection |
| #' weights implied by the architecture (e.g., 112 connection weights for an 8-10-2 network). |
| #' @rdname spark.mlp |
| #' @aliases summary,MultilayerPerceptronClassificationModel-method |
| #' @note summary(MultilayerPerceptronClassificationModel) since 2.1.0 |
setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"),
          function(object) {
            model <- object@jobj
            layerSizes <- unlist(callJMethod(model, "layers"))

            # The first layer size is the input count and the last is the output count.
            # list() evaluates left to right, so the weights are fetched after the layers.
            list(numOfInputs = head(layerSizes, n = 1),
                 numOfOutputs = tail(layerSizes, n = 1),
                 layers = layerSizes,
                 weights = callJMethod(model, "weights"))
          })
| |
| # Makes predictions from a model produced by spark.mlp(). |
| |
| #' @param newData a SparkDataFrame for testing. |
| #' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named |
| #' "prediction". |
| #' @rdname spark.mlp |
| #' @aliases predict,MultilayerPerceptronClassificationModel-method |
| #' @note predict(MultilayerPerceptronClassificationModel) since 2.1.0 |
setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel"),
          # Thin wrapper: both arguments are handed unchanged to predict_internal.
          function(object, newData) predict_internal(object, newData))
| |
| # Saves the Multilayer Perceptron Classification Model to the input path. |
| |
| #' @param path the directory where the model is saved. |
| #' @param overwrite overwrites or not if the output path already exists. Default is FALSE |
| #' which means throw exception if the output path exists. |
| #' |
| #' @rdname spark.mlp |
| #' @aliases write.ml,MultilayerPerceptronClassificationModel,character-method |
| #' @seealso \link{write.ml} |
| #' @note write.ml(MultilayerPerceptronClassificationModel, character) since 2.1.0 |
setMethod("write.ml",
          signature(object = "MultilayerPerceptronClassificationModel", path = "character"),
          # Saving is delegated to write_internal; overwrite is forwarded as given.
          function(object, path, overwrite = FALSE) write_internal(object, path, overwrite))
| |
| #' Naive Bayes Models |
| #' |
| #' \code{spark.naiveBayes} fits a Bernoulli naive Bayes model against a SparkDataFrame. |
| #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make |
| #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. |
| #' Only categorical data is supported. |
| #' |
| #' @param data a \code{SparkDataFrame} of observations and labels for model fitting. |
| #' @param formula a symbolic description of the model to be fitted. Currently only a few formula |
| #' operators are supported, including '~', '.', ':', '+', and '-'. |
| #' @param smoothing smoothing parameter. |
| #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and |
| #' label column of string type. |
| #' Supported options: "skip" (filter out rows with invalid data), |
| #' "error" (throw an error), "keep" (put invalid data in |
| #' a special additional bucket, at index numLabels). Default |
| #' is "error". |
| #' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}. |
| #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model. |
| #' @rdname spark.naiveBayes |
| #' @aliases spark.naiveBayes,SparkDataFrame,formula-method |
| #' @name spark.naiveBayes |
| #' @seealso e1071: \url{https://cran.r-project.org/package=e1071} |
| #' @examples |
| #' \dontrun{ |
| #' data <- as.data.frame(UCBAdmissions) |
| #' df <- createDataFrame(data) |
| #' |
| #' # fit a Bernoulli naive Bayes model |
| #' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0) |
| #' |
| #' # get the summary of the model |
| #' summary(model) |
| #' |
| #' # make predictions |
| #' predictions <- predict(model, df) |
| #' |
| #' # save and load the model |
| #' path <- "path/to/model" |
| #' write.ml(model, path) |
| #' savedModel <- read.ml(path) |
| #' summary(savedModel) |
| #' } |
| #' @note spark.naiveBayes since 2.0.0 |
setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, smoothing = 1.0,
                   handleInvalid = c("error", "keep", "skip")) {
            # deparse() may return several pieces; collapse them into one formula string.
            formula <- paste(deparse(formula), collapse = "")
            # Reduce the choice vector to a single validated option.
            handleInvalid <- match.arg(handleInvalid)
            # Note the argument order: this wrapper takes the formula before the data frame.
            # smoothing is coerced with as.numeric() like the numeric arguments of the other
            # fit wrappers in this file, so an integer value (e.g. 1L) is sent as a double.
            # NOTE(review): presumably the Scala fit() expects a Double here - confirm.
            jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
                                formula, data@sdf, as.numeric(smoothing), handleInvalid)
            new("NaiveBayesModel", jobj = jobj)
          })
| |
| # Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes} |
| |
| #' @param object a naive Bayes model fitted by \code{spark.naiveBayes}. |
| #' @return \code{summary} returns summary information of the fitted model, which is a list. |
| #' The list includes \code{apriori} (the label distribution) and |
| #' \code{tables} (conditional probabilities given the target label). |
| #' @rdname spark.naiveBayes |
| #' @note summary(NaiveBayesModel) since 2.0.0 |
setMethod("summary", signature(object = "NaiveBayesModel"),
          function(object) {
            model <- object@jobj
            featureNames <- callJMethod(model, "features")
            classLabels <- callJMethod(model, "labels")
            labelNames <- unlist(classLabels)

            # Label distribution as a single-row matrix with one column per label.
            aprioriMatrix <- t(as.matrix(unlist(callJMethod(model, "apriori"))))
            colnames(aprioriMatrix) <- labelNames

            # Conditional probabilities: one row per label, one column per feature.
            condTables <- matrix(callJMethod(model, "tables"), nrow = length(classLabels))
            rownames(condTables) <- labelNames
            colnames(condTables) <- unlist(featureNames)

            list(apriori = aprioriMatrix, tables = condTables)
          })
| |
| # Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), |
| # similarly to R package e1071's predict. |
| |
| #' @param newData a SparkDataFrame for testing. |
| #' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named |
| #' "prediction". |
| #' @rdname spark.naiveBayes |
| #' @note predict(NaiveBayesModel) since 2.0.0 |
setMethod("predict", signature(object = "NaiveBayesModel"),
          # Thin wrapper: both arguments are handed unchanged to predict_internal.
          function(object, newData) predict_internal(object, newData))
| |
| # Saves the Bernoulli naive Bayes model to the input path. |
| |
| #' @param path the directory where the model is saved. |
| #' @param overwrite overwrites or not if the output path already exists. Default is FALSE |
| #' which means throw exception if the output path exists. |
| #' |
| #' @rdname spark.naiveBayes |
| #' @seealso \link{write.ml} |
| #' @note write.ml(NaiveBayesModel, character) since 2.0.0 |
setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"),
          # Saving is delegated to write_internal; overwrite is forwarded as given.
          function(object, path, overwrite = FALSE) write_internal(object, path, overwrite))
| |
#' Factorization Machines Classification Model
#'
#' \code{spark.fmClassifier} fits a factorization classification model against a SparkDataFrame.
#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
#' Only categorical data is supported.
#'
#' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
#'                operators are supported, including '~', '.', ':', '+', and '-'.
#' @param factorSize dimensionality of the factors.
#' @param fitLinear whether to fit the linear term.
#' @param regParam the regularization parameter.
#' @param miniBatchFraction the mini-batch fraction parameter.
#' @param initStd the standard deviation of initial coefficients.
#' @param maxIter maximum iteration number.
#' @param stepSize stepSize parameter.
#' @param tol convergence tolerance of iterations.
#' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "adamW".
#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of
#'                   class label 1 is > threshold, then predict 1, else 0. A high threshold
#'                   encourages the model to predict 0 more often; a low threshold encourages the
#'                   model to predict 1 more often. Note: Setting this with threshold p is
#'                   equivalent to setting thresholds c(1-p, p).
#' @param seed seed parameter for weights initialization.
#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
#'                      label column of string type.
#'                      Supported options: "skip" (filter out rows with invalid data),
#'                                         "error" (throw an error), "keep" (put invalid data in
#'                                         a special additional bucket, at index numLabels). Default
#'                                         is "error".
#' @param ... additional arguments passed to the method.
#' @return \code{spark.fmClassifier} returns a fitted Factorization Machines Classification Model.
#' @rdname spark.fmClassifier
#' @aliases spark.fmClassifier,SparkDataFrame,formula-method
#' @name spark.fmClassifier
#' @seealso \link{read.ml}
#' @examples
#' \dontrun{
#' df <- read.df("data/mllib/sample_binary_classification_data.txt", source = "libsvm")
#'
#' # fit Factorization Machines Classification Model
#' model <- spark.fmClassifier(
#'            df, label ~ features,
#'            regParam = 0.01, maxIter = 10, fitLinear = TRUE
#'          )
#'
#' # get the summary of the model
#' summary(model)
#'
#' # make predictions
#' predictions <- predict(model, df)
#'
#' # save and load the model
#' path <- "path/to/model"
#' write.ml(model, path)
#' savedModel <- read.ml(path)
#' summary(savedModel)
#' }
#' @note spark.fmClassifier since 3.1.0
setMethod(
  "spark.fmClassifier",
  signature(data = "SparkDataFrame", formula = "formula"),
  function(data, formula, factorSize = 8, fitLinear = TRUE, regParam = 0.0,
           miniBatchFraction = 1.0, initStd = 0.01, maxIter = 100, stepSize = 1.0,
           tol = 1e-6, solver = c("adamW", "gd"), thresholds = NULL, seed = NULL,
           handleInvalid = c("error", "keep", "skip")) {
    # TODO: can fitLinear be expressed through the formula instead of a separate flag?

    # deparse() may return several strings for a long formula; glue them into a single one
    # before handing it to the JVM side.
    formula_str <- paste(deparse(formula), collapse = "")

    # Optional arguments stay NULL when unset. A supplied seed is sent as the string form of
    # its integer value, and supplied thresholds are sent as a list.
    seed_arg <- if (is.null(seed)) NULL else as.character(as.integer(seed))
    thresholds_arg <- if (is.null(thresholds)) NULL else as.list(thresholds)

    # match.arg() picks the first listed choice when the caller kept the default vector and
    # rejects values outside the declared choices.
    solver_name <- match.arg(solver)
    invalid_policy <- match.arg(handleInvalid)

    fitted <- callJStatic(
      "org.apache.spark.ml.r.FMClassifierWrapper", "fit",
      data@sdf,
      formula_str,
      as.integer(factorSize),
      as.logical(fitLinear),
      as.numeric(regParam),
      as.numeric(miniBatchFraction),
      as.numeric(initStd),
      as.integer(maxIter),
      as.numeric(stepSize),
      as.numeric(tol),
      solver_name,
      seed_arg,
      thresholds_arg,
      invalid_policy
    )

    new("FMClassificationModel", jobj = fitted)
  }
)
| |
# Builds the summary list for an FM Classification model produced by \code{spark.fmClassifier}

#' @param object a FM Classification model fitted by \code{spark.fmClassifier}.
#' @return \code{summary} returns summary information of the fitted model, which is a list.
#'         The list includes \code{coefficients}, \code{factors}, \code{numClasses},
#'         \code{numFeatures} and \code{factorSize}.
#' @rdname spark.fmClassifier
#' @note summary(FMClassificationModel) since 3.1.0
setMethod(
  "summary",
  signature(object = "FMClassificationModel"),
  function(object) {
    wrapper <- object@jobj

    feature_names <- callJMethod(wrapper, "rFeatures")
    coef_values <- callJMethod(wrapper, "rCoefficients")

    # Single-column matrix of estimates, with one row per feature name.
    coef_matrix <- as.matrix(unlist(coef_values))
    colnames(coef_matrix) <- "Estimate"
    rownames(coef_matrix) <- unlist(feature_names)

    n_classes <- callJMethod(wrapper, "numClasses")
    n_features <- callJMethod(wrapper, "numFeatures")
    factor_values <- unlist(callJMethod(wrapper, "rFactors"))
    n_factors <- callJMethod(wrapper, "factorSize")

    list(
      coefficients = coef_matrix,
      # The flat factor values are reshaped so that there is one column per factor dimension.
      factors = matrix(factor_values, ncol = n_factors),
      numClasses = n_classes,
      numFeatures = n_features,
      factorSize = n_factors
    )
  }
)
| |
# Computes predictions from an FMClassificationModel

#' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns the predicted values based on a FM Classification model.
#' @rdname spark.fmClassifier
#' @aliases predict,FMClassificationModel,SparkDataFrame-method
#' @note predict(FMClassificationModel) since 3.1.0
setMethod(
  "predict",
  signature(object = "FMClassificationModel"),
  function(object, newData) {
    # Hand the model and the new data straight to the predict_internal helper.
    predict_internal(object, newData)
  }
)
| |
# Persists a fitted FMClassificationModel under the input path

#' @param path The directory where the model is saved.
#' @param overwrite Whether to overwrite the output path if it already exists. Default is FALSE,
#'                  which means an exception is thrown if the output path exists.
#'
#' @rdname spark.fmClassifier
#' @aliases write.ml,FMClassificationModel,character-method
#' @note write.ml(FMClassificationModel, character) since 3.1.0
setMethod(
  "write.ml",
  signature(object = "FMClassificationModel", path = "character"),
  function(object, path, overwrite = FALSE) {
    # Forward all three arguments unchanged to the write_internal helper.
    write_internal(object, path, overwrite)
  }
)