blob: 6e29e4ea31ca0e9bba34ccb8eda90a71d9cfc202 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Define Partitioning for a Dataset
#'
#' @description
#' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()`
#' method to indicate how the file's paths should be interpreted to define
#' partitioning.
#'
#' `DirectoryPartitioning` describes how to interpret raw path segments, in
#' order. For example, `schema(year = int16(), month = int8())` would define
#' partitions for file paths like "2019/01/file.parquet",
#' "2019/02/file.parquet", etc. In this scheme `NULL` values will be skipped. In
#' the previous example: when writing a dataset if the month was `NA` (or
#' `NULL`), the files would be placed in "2019/file.parquet". When reading, the
#' rows in "2019/file.parquet" would return an `NA` for the month column. An
#' error will be raised if an outer directory is `NULL` and an inner directory
#' is not.
#'
#' `HivePartitioning` is for Hive-style partitioning, which embeds field
#' names and values in path segments, such as
#' "/year=2019/month=2/data.parquet". Because fields are named in the path
#' segments, order does not matter. This partitioning scheme allows `NULL`
#' values. They will be replaced by a configurable `null_fallback` which
#' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When
#' reading, the `null_fallback` string will be replaced with `NA`s as
#' appropriate.
#'
#' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect
#' partition features from the file paths.
#' @section Factory:
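#' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()`
#' methods take a [Schema] as their first argument, along with an optional
#' `segment_encoding` (default `"uri"`). `HivePartitioning$create()` also
#' accepts an optional `null_fallback`. The helper
#' function [`hive_partition(...)`][hive_partition] is shorthand for
#' `HivePartitioning$create(schema(...))`.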
#'
#' With `DirectoryPartitioningFactory$create()`, you can provide just the
#' names of the path segments (in our example, `c("year", "month")`), and
#' the `DatasetFactory` will infer the data types for those partition variables.
#' `HivePartitioningFactory$create()` requires no arguments (it optionally
#' accepts `null_fallback` and `segment_encoding`): both variable names
#' and their types can be inferred from the file paths. `hive_partition()` with
#' no field arguments returns a `HivePartitioningFactory`.
#' @name Partitioning
#' @rdname Partitioning
#' @export
Partitioning <- R6Class(
  # Common parent of the concrete partitioning classes defined in this file.
  classname = "Partitioning",
  inherit = ArrowObject
)
#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
DirectoryPartitioning <- R6Class(
  classname = "DirectoryPartitioning",
  inherit = Partitioning
)
# Constructor: hands the schema and the segment encoding (default "uri")
# to `dataset___DirectoryPartitioning()` and returns whatever it produces.
DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") {
  dataset___DirectoryPartitioning(
    schm,
    segment_encoding = segment_encoding
  )
}
#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
HivePartitioning <- R6Class(
  classname = "HivePartitioning",
  inherit = Partitioning
)
# Constructor: resolves the null fallback, then forwards the schema, the
# fallback and the segment encoding to `dataset___HivePartitioning()`.
HivePartitioning$create <- function(schm, null_fallback = NULL, segment_encoding = "uri") {
  # A NULL `null_fallback` is swapped for the package default here.
  fallback <- null_fallback_or_default(null_fallback)
  dataset___HivePartitioning(
    schm,
    null_fallback = fallback,
    segment_encoding = segment_encoding
  )
}
#' Construct Hive partitioning
#'
#' Hive partitioning embeds field names and values in path segments, such as
#' "/year=2019/month=2/data.parquet".
#'
#' Because fields are named in the path segments, order of fields passed to
#' `hive_partition()` does not matter.
#' @param ... named list of [data types][data-type], passed to [schema()]
#' @param null_fallback character to be used in place of missing values (`NA` or `NULL`)
#' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`,
#' which is what Hive uses.
#' @param segment_encoding Decode partition segments after splitting paths.
#' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is).
#' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if
#' calling `hive_partition()` with no arguments.
#' @examplesIf arrow_with_dataset()
#' hive_partition(year = int16(), month = int8())
#' @export
hive_partition <- function(...,
                           null_fallback = NULL,
                           segment_encoding = "uri") {
  partition_schema <- schema(...)

  # No fields supplied: return a factory instead of a fixed partitioning.
  if (length(partition_schema) == 0) {
    return(HivePartitioningFactory$create(null_fallback, segment_encoding))
  }

  # Fields supplied: build a HivePartitioning from the resulting schema.
  HivePartitioning$create(partition_schema, null_fallback, segment_encoding)
}
PartitioningFactory <- R6Class(
  # Common parent of the partitioning factory classes defined in this file.
  classname = "PartitioningFactory",
  inherit = ArrowObject
)
#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
DirectoryPartitioningFactory <- R6Class(
  # The classname must match the generator name exactly. A stray trailing
  # space here would end up in the class attribute of instances and make
  # `inherits(x, "DirectoryPartitioningFactory")` fail.
  classname = "DirectoryPartitioningFactory",
  inherit = PartitioningFactory
)
# Constructor: forwards the partition field names and the segment encoding
# (default "uri") to `dataset___DirectoryPartitioning__MakeFactory()`.
DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") {
  dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding)
}
#' @usage NULL
#' @format NULL
#' @rdname Partitioning
#' @export
HivePartitioningFactory <- R6Class(
  classname = "HivePartitioningFactory",
  inherit = PartitioningFactory
)
# Constructor: resolves the null fallback, then forwards it with the segment
# encoding to `dataset___HivePartitioning__MakeFactory()`.
HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") {
  # A NULL `null_fallback` is swapped for the package default here.
  fallback <- null_fallback_or_default(null_fallback)
  dataset___HivePartitioning__MakeFactory(fallback, segment_encoding)
}
null_fallback_or_default <- function(null_fallback) {
  # Only NULL triggers the default; any other value (including "" or NA)
  # is returned untouched.
  if (is.null(null_fallback)) {
    return("__HIVE_DEFAULT_PARTITION__")
  }
  null_fallback
}