#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A set of S3 classes and methods that support the SparkSQL `StructType` and `StructField`
# data types. These are used to create and interact with SparkDataFrame schemas.
#' structType
#'
#' Create a structType object that contains the metadata for a SparkDataFrame. Intended for
#' use with createDataFrame and toDF.
#'
#' @param x a structField object (created with the \code{structField} method). Since Spark 2.3,
#'          this can be a DDL-formatted string, which is a comma-separated list of field
#'          definitions, e.g., "a INT, b STRING".
#' @param ... additional structField objects
#' @return a structType object
#' @rdname structType
#' @examples
#'\dontrun{
#' schema <- structType(structField("a", "integer"), structField("c", "string"),
#'                      structField("avg", "double"))
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' schema <- structType("a INT, c STRING, avg DOUBLE")
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' }
#' @note structType since 1.4.0
structType <- function(x, ...) {
UseMethod("structType", x)
}
#' @rdname structType
#' @method structType jobj
structType.jobj <- function(x, ...) {
obj <- structure(list(), class = "structType")
obj$jobj <- x
obj$fields <- function() { lapply(callJMethod(obj$jobj, "fields"), structField) }
obj
}
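# Note: the fields are not copied into R; obj$fields() calls back into the JVM each time
# it is invoked, so one would list the field names with something like (illustrative):
#   sapply(schema$fields(), function(f) f$name())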
#' @rdname structType
#' @method structType structField
structType.structField <- function(x, ...) {
  fields <- list(x, ...)
  if (!all(sapply(fields, inherits, "structField"))) {
    stop("All arguments must be structField objects.")
  }
  sfObjList <- lapply(fields, function(field) {
    field$jobj
  })
  stObj <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                       "createStructType",
                       sfObjList)
  structType(stObj)
}
#' @rdname structType
#' @method structType character
structType.character <- function(x, ...) {
  if (!is.character(x)) {
    stop("schema must be a DDL-formatted string.")
  }
  if (length(list(...)) > 0) {
    stop("multiple DDL-formatted strings are not supported")
  }
  stObj <- handledCallJStatic("org.apache.spark.sql.types.StructType",
                              "fromDDL",
                              x)
  structType(stObj)
}
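# Illustrative use of the DDL-string form (a sketch, not executed here; it assumes an
# active SparkSession so that StructType.fromDDL can be reached on the JVM side):
#   schema <- structType("name STRING, age INT, scores ARRAY<DOUBLE>")
#   schema$fields()  # list of structField objects parsed from the DDL string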
#' Print a Spark StructType.
#'
#' This function prints the contents of a StructType returned from the
#' SparkR JVM backend.
#'
#' @param x A StructType object
#' @param ... further arguments passed to or from other methods
#' @note print.structType since 1.4.0
print.structType <- function(x, ...) {
cat("StructType\n",
sapply(x$fields(),
function(field) {
paste0("|-", "name = \"", field$name(),
"\", type = \"", field$dataType.toString(),
"\", nullable = ", field$nullable(), "\n")
}),
sep = "")
}
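# Illustrative output for a two-field schema (the exact type strings come from the JVM
# DataType.toString, so they may vary by Spark version):
#   StructType
#   |-name = "a", type = "IntegerType", nullable = TRUE
#   |-name = "c", type = "StringType", nullable = TRUE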
#' structField
#'
#' Create a structField object that contains the metadata for a single field in a schema.
#'
#' @param x the name of the field.
#' @param ... additional argument(s) passed to the method.
#' @return A structField object.
#' @rdname structField
#' @examples
#'\dontrun{
#' field1 <- structField("a", "integer")
#' field2 <- structField("c", "string")
#' field3 <- structField("avg", "double")
#' schema <- structType(field1, field2, field3)
#' df1 <- gapply(df, list("a", "c"),
#' function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#' schema)
#' }
#' @note structField since 1.4.0
structField <- function(x, ...) {
UseMethod("structField", x)
}
#' @rdname structField
#' @method structField jobj
structField.jobj <- function(x, ...) {
obj <- structure(list(), class = "structField")
obj$jobj <- x
obj$name <- function() { callJMethod(x, "name") }
obj$dataType <- function() { callJMethod(x, "dataType") }
obj$dataType.toString <- function() { callJMethod(obj$dataType(), "toString") }
obj$dataType.simpleString <- function() { callJMethod(obj$dataType(), "simpleString") }
obj$nullable <- function() { callJMethod(x, "nullable") }
obj
}
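# The accessors above are closures over the JVM reference; for a field built with
# structField("age", "integer") one would expect (illustrative):
#   field$name()                   # "age"
#   field$dataType.simpleString()  # "int"
#   field$nullable()               # TRUE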
checkType <- function(type) {
  if (!is.null(PRIMITIVE_TYPES[[type]])) {
    return()
  } else {
    # Check complex types
    firstChar <- substr(type, 1, 1)
    switch(firstChar,
      a = {
        # Array type
        m <- regexec("^array<(.+)>$", type)
        matchedStrings <- regmatches(type, m)
        if (length(matchedStrings[[1]]) >= 2) {
          elemType <- matchedStrings[[1]][2]
          checkType(elemType)
          return()
        }
      },
      m = {
        # Map type
        m <- regexec("^map<(.+),(.+)>$", type)
        matchedStrings <- regmatches(type, m)
        if (length(matchedStrings[[1]]) >= 3) {
          keyType <- matchedStrings[[1]][2]
          if (keyType != "string" && keyType != "character") {
            stop("Key type in a map must be string or character")
          }
          valueType <- matchedStrings[[1]][3]
          checkType(valueType)
          return()
        }
      },
      s = {
        # Struct type
        m <- regexec("^struct<(.+)>$", type)
        matchedStrings <- regmatches(type, m)
        if (length(matchedStrings[[1]]) >= 2) {
          fieldsString <- matchedStrings[[1]][2]
          # strsplit does not return the final empty string, so check if
          # the final char is ","
          if (substr(fieldsString, nchar(fieldsString), nchar(fieldsString)) != ",") {
            fields <- strsplit(fieldsString, ",", fixed = TRUE)[[1]]
            for (field in fields) {
              m <- regexec("^(.+):(.+)$", field)
              matchedStrings <- regmatches(field, m)
              if (length(matchedStrings[[1]]) >= 3) {
                fieldType <- matchedStrings[[1]][3]
                checkType(fieldType)
              } else {
                break
              }
            }
            return()
          }
        }
      })
  }

  stop("Unsupported type for SparkDataframe: ", type)
}
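# A few illustrative type strings and how checkType is expected to treat them (kept as
# comments so nothing runs at package load time):
#   checkType("integer")                             # primitive, found in PRIMITIVE_TYPES
#   checkType("array<string>")                       # element type checked recursively
#   checkType("map<string,double>")                  # map keys must be string/character
#   checkType("struct<a:integer,b:array<double>>")   # each field type checked in turn
#   checkType("map<integer,string>")                 # error: key type must be string
#   checkType("foo")                                 # error: unsupported type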
#' @param type The data type of the field
#' @param nullable A logical vector indicating whether or not the field is nullable
#' @rdname structField
structField.character <- function(x, type, nullable = TRUE, ...) {
if (class(x) != "character") {
stop("Field name must be a string.")
}
if (class(type) != "character") {
stop("Field type must be a string.")
}
if (class(nullable) != "logical") {
stop("nullable must be either TRUE or FALSE")
}
checkType(type)
sfObj <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
"createStructField",
x,
type,
nullable)
structField(sfObj)
}
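# Illustrative calls (a sketch; they assume a running SparkR backend so callJStatic can
# reach SQLUtils.createStructField):
#   age <- structField("age", "integer", nullable = FALSE)
#   tags <- structField("tags", "array<string>")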
#' Print a Spark StructField.
#'
#' This function prints the contents of a StructField returned from the
#' SparkR JVM backend.
#'
#' @param x A StructField object
#' @param ... further arguments passed to or from other methods
#' @note print.structField since 1.4.0
print.structField <- function(x, ...) {
cat("StructField(name = \"", x$name(),
"\", type = \"", x$dataType.toString(),
"\", nullable = ", x$nullable(),
")",
sep = "")
}
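# Illustrative output for print(structField("a", "integer")) (the type string comes from
# the JVM DataType.toString):
#   StructField(name = "a", type = "IntegerType", nullable = TRUE)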