# blob: 0e029cb74bdb43ccb44fff71ae4e93311206d237 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' @include dataset.R
#' @usage NULL
#' @format NULL
#' @rdname Dataset
#' @export
DatasetFactory <- R6Class(
  "DatasetFactory",
  inherit = ArrowObject,
  public = list(
    # Finish the factory.
    #
    # * With `schema` supplied: it must be a "Schema"; the call is delegated to
    #   the Finish2 binding and `unify_schemas` is not used on that path.
    # * With `schema = NULL`: the call is delegated to the Finish1 binding,
    #   forwarding `unify_schemas`.
    Finish = function(schema = NULL, unify_schemas = FALSE) {
      if (!is.null(schema)) {
        assert_is(schema, "Schema")
        return(dataset___DatasetFactory__Finish2(self, schema))
      }
      dataset___DatasetFactory__Finish1(self, unify_schemas)
    },

    # Forward `unify_schemas` to the Inspect binding and return its result.
    Inspect = function(unify_schemas = FALSE) {
      dataset___DatasetFactory__Inspect(self, unify_schemas)
    }
  )
)
# Construct a DatasetFactory from `x`.
#
# `x` may be a list of DatasetFactory objects (combined into a union factory,
# in which case all other arguments are disregarded), one or more file paths,
# or a directory path that is scanned recursively.
DatasetFactory$create <- function(x,
                                  filesystem = NULL,
                                  format = c(
                                    "parquet", "arrow", "ipc", "feather",
                                    "csv", "tsv", "text"
                                  ),
                                  partitioning = NULL,
                                  ...) {
  if (is_list_of(x, "DatasetFactory")) {
    return(dataset___UnionDatasetFactory__Make(x))
  }

  # A string identifier is turned into a FileFormat (with `...` forwarded);
  # anything else must already be a FileFormat.
  if (!is.character(format)) {
    assert_is(format, "FileFormat")
  } else {
    format <- FileFormat$create(match.arg(format), ...)
  }

  resolved <- get_paths_and_filesystem(x, filesystem)
  fs <- resolved$fs
  location <- resolved$path
  file_info <- fs$GetFileInfo(location)

  # More than one entry, or a single entry that is a file, means `x` names
  # files rather than a directory. `partitioning` is not forwarded here.
  names_files <- length(file_info) > 1 || file_info[[1]]$type == FileType$File
  if (names_files) {
    return(FileSystemDatasetFactory$create(fs, NULL, location, format))
  }

  # Coerce the convenience forms of `partitioning`; NULL and any other object
  # pass through untouched and are checked by FileSystemDatasetFactory$create.
  if (inherits(partitioning, "Schema")) {
    partitioning <- DirectoryPartitioning$create(partitioning)
  } else if (is.character(partitioning)) {
    # Only field names were given, so their types are left to be autodetected
    partitioning <- DirectoryPartitioningFactory$create(partitioning)
  }

  selector <- FileSelector$create(
    location,
    allow_not_found = FALSE,
    recursive = TRUE
  )
  FileSystemDatasetFactory$create(fs, selector, NULL, format, partitioning)
}
#' Create a DatasetFactory
#'
#' A [Dataset] can be constructed using one or more [DatasetFactory]s.
#' This function helps you construct a `DatasetFactory` that you can pass to
#' [open_dataset()].
#'
#' If you only have a single `DatasetFactory` (for example, you have a
#' single directory containing Parquet files), you can call `open_dataset()`
#' directly. Use `dataset_factory()` when you
#' want to combine different directories, file systems, or file formats.
#'
#' @param x A string path to a directory containing data files, a vector of
#' one or more string paths to data files, or a list of `DatasetFactory` objects
#' whose datasets should be combined. If a list of `DatasetFactory` objects is
#' provided, it will be used to construct a `UnionDatasetFactory` and other
#' arguments will be ignored.
#' @param filesystem A [FileSystem] object; if omitted, the `FileSystem` will
#' be detected from `x`
#' @param format A [FileFormat] object, or a string identifier of the format of
#' the files in `x`. Currently supported values:
#' * "parquet"
#' * "ipc"/"arrow"/"feather", all aliases for each other; for Feather, note that
#' only version 2 files are supported
#' * "csv"/"text", aliases for the same thing (because comma is the default
#' delimiter for text files)
#' * "tsv", equivalent to passing `format = "text", delimiter = "\t"`
#'
#' Default is "parquet", unless a `delimiter` is also specified, in which case
#' it is assumed to be "text".
#' @param partitioning One of
#' * A `Schema`, in which case the file paths relative to `x` will be
#' parsed, and path segments will be matched with the schema fields. For
#' example, `schema(year = int16(), month = int8())` would create partitions
#' for file paths like "2019/01/file.parquet", "2019/02/file.parquet", etc.
#' * A character vector that defines the field names corresponding to those
#' path segments (that is, you're providing the names that would correspond
#' to a `Schema` but the types will be autodetected)
#' * A `HivePartitioning` or `HivePartitioningFactory`, as returned
#' by [hive_partition()] which parses explicit or autodetected fields from
#' Hive-style path segments
#' * `NULL` for no partitioning
#'
#' `partitioning` is only used when `x` is a directory path; it is not
#' forwarded when `x` is a vector of one or more file paths.
#' @param ... Additional format-specific options, passed to
#' `FileFormat$create()`. For CSV options, note that you can specify them either
#' with the Arrow C++ library naming ("delimiter", "quoting", etc.) or the
#' `readr`-style naming used in [read_csv_arrow()] ("delim", "quote", etc.).
#' Not all `readr` options are currently supported; please file an issue if you
#' encounter one that `arrow` should support.
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
#' in a list potentially with other `DatasetFactory` objects, to create
#' a `Dataset`.
#' @export
dataset_factory <- DatasetFactory$create
#' @usage NULL
#' @format NULL
#' @rdname Dataset
#' @export
FileSystemDatasetFactory <- R6Class(
  "FileSystemDatasetFactory",
  # No members are declared here; the class only inherits from DatasetFactory.
  inherit = DatasetFactory
)
# Construct a FileSystemDatasetFactory from either a FileSelector or a
# character vector of paths (exactly one of the two must be non-NULL).
#
# `partitioning` may be NULL, a PartitioningFactory or a Partitioning, and is
# only accepted together with `selector`; combining it with `paths` is an
# error. Returns the object produced by the matching Make binding.
FileSystemDatasetFactory$create <- function(filesystem,
                                            selector = NULL,
                                            paths = NULL,
                                            format,
                                            partitioning = NULL) {
  assert_is(filesystem, "FileSystem")
  if (!is.null(selector)) {
    assert_is(selector, "FileSelector")
  }
  if (!is.null(paths)) {
    assert_is(paths, "character")
  }

  no_selector <- is.null(selector)
  no_paths <- is.null(paths)
  assert_that(
    xor(no_selector, no_paths),
    msg = "Either selector or paths must be specified"
  )
  assert_is(format, "FileFormat")

  # Each branch maps to one Make binding; the PartitioningFactory test
  # deliberately precedes the Partitioning test, as in the original dispatch.
  factory <- if (!no_paths) {
    assert_that(
      is.null(partitioning),
      msg = "Partitioning not supported with paths"
    )
    dataset___FileSystemDatasetFactory__Make0(filesystem, paths, format)
  } else if (is.null(partitioning)) {
    dataset___FileSystemDatasetFactory__Make1(filesystem, selector, format)
  } else if (inherits(partitioning, "PartitioningFactory")) {
    dataset___FileSystemDatasetFactory__Make3(
      filesystem, selector, format, partitioning
    )
  } else if (inherits(partitioning, "Partitioning")) {
    dataset___FileSystemDatasetFactory__Make2(
      filesystem, selector, format, partitioning
    )
  } else {
    stop(
      "Expected 'partitioning' to be NULL, PartitioningFactory or Partitioning",
      call. = FALSE
    )
  }
  factory
}