| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| #' Define Partitioning for a Dataset |
| #' |
| #' @description |
| #' Pass a `Partitioning` object to a [FileSystemDatasetFactory]'s `$create()` |
| #' method to indicate how the file's paths should be interpreted to define |
| #' partitioning. |
| #' |
| #' `DirectoryPartitioning` describes how to interpret raw path segments, in |
| #' order. For example, `schema(year = int16(), month = int8())` would define |
| #' partitions for file paths like "2019/01/file.parquet", |
| #' "2019/02/file.parquet", etc. In this scheme `NULL` values will be skipped. In |
| #' the previous example: when writing a dataset if the month was `NA` (or |
| #' `NULL`), the files would be placed in "2019/file.parquet". When reading, the |
| #' rows in "2019/file.parquet" would return an `NA` for the month column. An |
| #' error will be raised if an outer directory is `NULL` and an inner directory |
| #' is not. |
| #' |
| #' `HivePartitioning` is for Hive-style partitioning, which embeds field |
| #' names and values in path segments, such as |
| #' "/year=2019/month=2/data.parquet". Because fields are named in the path |
| #' segments, order does not matter. This partitioning scheme allows `NULL` |
| #' values. They will be replaced by a configurable `null_fallback` which |
| #' defaults to the string `"__HIVE_DEFAULT_PARTITION__"` when writing. When |
| #' reading, the `null_fallback` string will be replaced with `NA`s as |
| #' appropriate. |
| #' |
| #' `PartitioningFactory` subclasses instruct the `DatasetFactory` to detect |
| #' partition features from the file paths. |
| #' @section Factory: |
| #' Both `DirectoryPartitioning$create()` and `HivePartitioning$create()` |
| #' methods take a [Schema] as a single input argument. The helper |
| #' function [`hive_partition(...)`][hive_partition] is shorthand for |
| #' `HivePartitioning$create(schema(...))`. |
| #' |
| #' With `DirectoryPartitioningFactory$create()`, you can provide just the |
| #' names of the path segments (in our example, `c("year", "month")`), and |
| #' the `DatasetFactory` will infer the data types for those partition variables. |
| #' `HivePartitioningFactory$create()` takes no arguments: both variable names |
| #' and their types can be inferred from the file paths. `hive_partition()` with |
| #' no arguments returns a `HivePartitioningFactory`. |
| #' @name Partitioning |
| #' @rdname Partitioning |
| #' @export |
| Partitioning <- R6Class("Partitioning", inherit = ArrowObject) |
| #' @usage NULL |
| #' @format NULL |
| #' @rdname Partitioning |
| #' @export |
| DirectoryPartitioning <- R6Class("DirectoryPartitioning", inherit = Partitioning) |
| DirectoryPartitioning$create <- function(schm, segment_encoding = "uri") { |
| dataset___DirectoryPartitioning(schm, segment_encoding = segment_encoding) |
| } |
| |
| #' @usage NULL |
| #' @format NULL |
| #' @rdname Partitioning |
| #' @export |
| HivePartitioning <- R6Class("HivePartitioning", inherit = Partitioning) |
| HivePartitioning$create <- function(schm, null_fallback = NULL, segment_encoding = "uri") { |
| dataset___HivePartitioning(schm, |
| null_fallback = null_fallback_or_default(null_fallback), |
| segment_encoding = segment_encoding) |
| } |
| |
| #' Construct Hive partitioning |
| #' |
| #' Hive partitioning embeds field names and values in path segments, such as |
| #' "/year=2019/month=2/data.parquet". |
| #' |
| #' Because fields are named in the path segments, order of fields passed to |
| #' `hive_partition()` does not matter. |
| #' @param ... named list of [data types][data-type], passed to [schema()] |
| #' @param null_fallback character to be used in place of missing values (`NA` or `NULL`) |
| #' in partition columns. Default is `"__HIVE_DEFAULT_PARTITION__"`, |
| #' which is what Hive uses. |
| #' @param segment_encoding Decode partition segments after splitting paths. |
| #' Default is `"uri"` (URI-decode segments). May also be `"none"` (leave as-is). |
| #' @return A [HivePartitioning][Partitioning], or a `HivePartitioningFactory` if |
| #' calling `hive_partition()` with no arguments. |
| #' @examplesIf arrow_with_dataset() |
| #' hive_partition(year = int16(), month = int8()) |
| #' @export |
| hive_partition <- function(..., null_fallback = NULL, segment_encoding = "uri") { |
| schm <- schema(...) |
| if (length(schm) == 0) { |
| HivePartitioningFactory$create(null_fallback, segment_encoding) |
| } else { |
| HivePartitioning$create(schm, null_fallback, segment_encoding) |
| } |
| } |
| |
| PartitioningFactory <- R6Class("PartitioningFactory", inherit = ArrowObject) |
| |
| #' @usage NULL |
| #' @format NULL |
| #' @rdname Partitioning |
| #' @export |
| DirectoryPartitioningFactory <- R6Class("DirectoryPartitioningFactory ", inherit = PartitioningFactory) |
| DirectoryPartitioningFactory$create <- function(field_names, segment_encoding = "uri") { |
| dataset___DirectoryPartitioning__MakeFactory(field_names, segment_encoding) |
| } |
| |
| #' @usage NULL |
| #' @format NULL |
| #' @rdname Partitioning |
| #' @export |
| HivePartitioningFactory <- R6Class("HivePartitioningFactory", inherit = PartitioningFactory) |
| HivePartitioningFactory$create <- function(null_fallback = NULL, segment_encoding = "uri") { |
| dataset___HivePartitioning__MakeFactory(null_fallback_or_default(null_fallback), segment_encoding) |
| } |
| |
| null_fallback_or_default <- function(null_fallback) { |
| null_fallback %||% "__HIVE_DEFAULT_PARTITION__" |
| } |