blob: f82fb589bb5a5e644b7d994a34fa13910d1ffdc8 [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# mllib_stat.R: Provides methods for MLlib statistics algorithms integration
#' S4 class that represents a KSTest
#'
#' @param jobj a Java object reference to the backing Scala KSTestWrapper
#' @note KSTest since 2.1.0
# The class carries a single slot, `jobj`, whose declared class is "jobj".
setClass("KSTest", slots = c(jobj = "jobj"))
#' (One-Sample) Kolmogorov-Smirnov Test
#'
#' @description
#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
#' continuous distribution.
#'
#' By comparing the largest difference between the empirical cumulative
#' distribution of the sample data and the theoretical distribution we can provide a test for the
#' null hypothesis that the sample data comes from that theoretical distribution.
#'
#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest}
#' to print out a summary result.
#'
#' @param data a SparkDataFrame of user data.
#' @param testCol column name where the test data is from. It should be a column of double type.
#' @param nullHypothesis name of the theoretical distribution tested against. Currently only
#' \code{"norm"} for normal distribution is supported.
#' @param distParams parameter(s) of the distribution. For \code{nullHypothesis = "norm"},
#' we can provide as a vector the mean and standard deviation of
#' the distribution. If none is provided, then standard normal will be used.
#' If only one is provided, then the standard deviation will be set to be one.
#' @param ... additional argument(s) passed to the method.
#' @return \code{spark.kstest} returns a test result object.
#' @rdname spark.kstest
#' @aliases spark.kstest,SparkDataFrame-method
#' @name spark.kstest
#' @seealso \href{https://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
#' MLlib: Hypothesis Testing}
#' @examples
#' \dontrun{
#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25))
#' df <- createDataFrame(data)
#' test <- spark.kstest(df, "test", "norm", c(0, 1))
#'
#' # get a summary of the test result
#' testSummary <- summary(test)
#' testSummary
#'
#' # print out the summary in an organized way
#' print.summary.KSTest(testSummary)
#' }
#' @note spark.kstest since 2.1.0
setMethod("spark.kstest", signature(data = "SparkDataFrame"),
          function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) {
            # Validate the requested distribution and keep the canonical name that
            # match.arg() returns. match.arg() accepts unambiguous abbreviations
            # (e.g. "n" for "norm"), so the raw argument must not be compared with
            # "norm" directly: doing so made abbreviated input skip the test and
            # return NULL silently.
            nullHypothesis <- tryCatch(match.arg(nullHypothesis),
                                       error = function(e) {
                                         # Still sees the caller-supplied value here, because
                                         # the assignment above has not happened on error.
                                         stop("Distribution ", nullHypothesis, " is not supported.")
                                       })
            if (nullHypothesis == "norm") {
              distParams <- as.numeric(distParams)
              # Missing entries fall back to the standard normal: mean 0, standard
              # deviation 1. Plain if/else is used because the conditions are scalar.
              mu <- if (length(distParams) < 1) 0 else distParams[[1]]
              sigma <- if (length(distParams) < 2) 1 else distParams[[2]]
              # Run the test through the static "test" method of KSTestWrapper and
              # wrap the returned Java object reference in a KSTest instance.
              jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper",
                                  "test", data@sdf, testCol, nullHypothesis,
                                  as.array(c(mu, sigma)))
              new("KSTest", jobj = jobj)
            }
          })
# Get the summary of Kolmogorov-Smirnov (KS) Test.
#' @param object test result object of KSTest by \code{spark.kstest}.
#' @return \code{summary} returns summary information of KSTest object, which is a list.
#' The list includes the \code{p.value} (p-value), \code{statistic} (test statistic
#' computed for the test), \code{nullHypothesis} (the null hypothesis with its
#' parameters tested against) and \code{degreesOfFreedom} (degrees of freedom of the test).
#' @rdname spark.kstest
#' @aliases summary,KSTest-method
#' @note summary(KSTest) since 2.1.0
setMethod("summary", signature(object = "KSTest"),
          function(object) {
            # Java object reference stored in the KSTest instance.
            backing <- object@jobj
            # Invoke a no-argument method on the backing object by name.
            fetch <- function(getter) callJMethod(backing, getter)
            # Fields are read in the same order as they appear in the result list;
            # the backing reference is kept so the summary can be printed later.
            result <- list(p.value = fetch("pValue"),
                           statistic = fetch("statistic"),
                           nullHypothesis = fetch("nullHypothesis"),
                           nullHypothesis.name = fetch("distName"),
                           nullHypothesis.parameters = unlist(fetch("distParams")),
                           degreesOfFreedom = fetch("degreesOfFreedom"),
                           jobj = backing)
            # Tag the list with an S3 class so print.summary.KSTest can dispatch on it.
            structure(result, class = "summary.KSTest")
          })
# Prints the summary of KSTest
#' @rdname spark.kstest
#' @param x summary object of KSTest returned by \code{summary}.
#' @note print.summary.KSTest since 2.1.0
print.summary.KSTest <- function(x, ...) {
  # Fetch the text produced by the "summary" method of the Java object stored
  # in the summary list and write it out, followed by a newline.
  cat(callJMethod(x$jobj, "summary"), "\n")
  # Return the input invisibly.
  invisible(x)
}