| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| # mllib_stat.R: Provides methods for MLlib statistics algorithms integration |
| |
#' S4 class that represents a KSTest
#'
#' @param jobj a Java object reference to the backing Scala KSTestWrapper
#' @note KSTest since 2.1.0
setClass("KSTest", slots = c(jobj = "jobj"))
| |
#' (One-Sample) Kolmogorov-Smirnov Test
#'
#' @description
#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
#' continuous distribution.
#'
#' By comparing the largest difference between the empirical cumulative
#' distribution of the sample data and the theoretical distribution we can provide a test for
#' the null hypothesis that the sample data comes from that theoretical distribution.
#'
#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest}
#' to print out a summary result.
#'
#' @param data a SparkDataFrame of user data.
#' @param testCol column name where the test data is from. It should be a column of double type.
#' @param nullHypothesis name of the theoretical distribution tested against. Currently only
#'                       \code{"norm"} for normal distribution is supported. The name is
#'                       resolved with \code{match.arg}, so an unambiguous abbreviation is
#'                       accepted and expanded to the full name.
#' @param distParams parameter(s) of the distribution. For \code{nullHypothesis = "norm"},
#'                   we can provide as a vector the mean and standard deviation of
#'                   the distribution. If none is provided, then standard normal will be used.
#'                   If only one is provided, then the standard deviation will be set to be one.
#' @param ... additional argument(s) passed to the method.
#' @return \code{spark.kstest} returns a test result object.
#' @rdname spark.kstest
#' @aliases spark.kstest,SparkDataFrame-method
#' @name spark.kstest
#' @seealso \href{https://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
#'          MLlib: Hypothesis Testing}
#' @examples
#' \dontrun{
#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25))
#' df <- createDataFrame(data)
#' test <- spark.kstest(df, "test", "norm", c(0, 1))
#'
#' # get a summary of the test result
#' testSummary <- summary(test)
#' testSummary
#'
#' # print out the summary in an organized way
#' print.summary.KSTest(testSummary)
#' }
#' @note spark.kstest since 2.1.0
setMethod("spark.kstest", signature(data = "SparkDataFrame"),
          function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) {
            # Keep the value returned by match.arg: it is the canonical distribution name.
            # Discarding it would let a valid abbreviation pass validation and then fall
            # through the comparison below, silently returning NULL.
            nullHypothesis <- tryCatch(match.arg(nullHypothesis),
                                       error = function(e) {
                                         stop("Distribution ",
                                              paste(nullHypothesis, collapse = ", "),
                                              " is not supported.")
                                       })
            if (nullHypothesis == "norm") {
              distParams <- as.numeric(distParams)
              # Missing parameters fall back to the standard normal: mean 0, sd 1.
              mu <- if (length(distParams) < 1) 0 else distParams[1]
              sigma <- if (length(distParams) < 2) 1 else distParams[2]
              jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper",
                                  "test", data@sdf, testCol, nullHypothesis,
                                  as.array(c(mu, sigma)))
              new("KSTest", jobj = jobj)
            }
          })
| |
# Get the summary of Kolmogorov-Smirnov (KS) Test.

#' @param object test result object of KSTest by \code{spark.kstest}.
#' @return \code{summary} returns summary information of KSTest object, which is a list.
#'         The list includes the \code{p.value} (p-value), \code{statistic} (test statistic
#'         computed for the test), \code{nullHypothesis} (the null hypothesis with its
#'         parameters tested against) and \code{degreesOfFreedom} (degrees of freedom of the test).
#' @rdname spark.kstest
#' @aliases summary,KSTest-method
#' @note summary(KSTest) since 2.1.0
setMethod("summary", signature(object = "KSTest"),
          function(object) {
            wrapper <- object@jobj
            # Local accessor: fetch one named field from the backing wrapper object.
            fetch <- function(field) callJMethod(wrapper, field)

            # List elements are evaluated in order, so the wrapper is queried in the
            # sequence written here. The wrapper itself is kept in the result so that
            # print.summary.KSTest can ask it for the formatted text.
            result <- list(
              p.value = fetch("pValue"),
              statistic = fetch("statistic"),
              nullHypothesis = fetch("nullHypothesis"),
              nullHypothesis.name = fetch("distName"),
              nullHypothesis.parameters = unlist(fetch("distParams")),
              degreesOfFreedom = fetch("degreesOfFreedom"),
              jobj = wrapper
            )
            structure(result, class = "summary.KSTest")
          })
| |
# Prints the summary of KSTest

#' @rdname spark.kstest
#' @param x summary object of KSTest returned by \code{summary}.
#' @note print.summary.KSTest since 2.1.0
print.summary.KSTest <- function(x, ...) {
  # The text comes from the "summary" method of the wrapper object stored in x$jobj.
  cat(callJMethod(x$jobj, "summary"), "\n")
  # Return the input invisibly so the call does not auto-print it again.
  invisible(x)
}