blob: 8a70f4c5b7bfca38aeb878e581236431ed42f6a5 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' @importFrom utils getFromNamespace
glimpse.ArrowTabular <- function(x,
width = getOption("pillar.width", getOption("width")),
...) {
# This function is inspired by pillar:::glimpse.tbl(), with some adaptations
# We use cli:: and pillar:: throughout this function. We don't need to check
# to see if they're installed because dplyr depends on pillar, which depends
# on cli, and we're only in this function though S3 dispatch on dplyr::glimpse
if (!is.finite(width)) {
abort("`width` must be finite.")
}
# We need a couple of internal functions in pillar for formatting
pretty_int <- getFromNamespace("big_mark", "pillar")
make_shorter <- getFromNamespace("str_trunc", "pillar")
tickify <- getFromNamespace("tick_if_needed", "pillar")
# Even though this is the ArrowTabular method, we use it for arrow_dplyr_query
# so make some accommodations. (Others are handled by S3 method dispatch.)
if (inherits(x, "arrow_dplyr_query")) {
class_title <- paste(source_data(x)$class_title(), "(query)")
} else {
class_title <- x$class_title()
}
cli::cat_line(class_title)
dims <- dim(x)
cli::cat_line(sprintf(
"%s rows x %s columns", pretty_int(dims[1]), pretty_int(dims[2])
))
if (dims[2] == 0) {
return(invisible(x))
}
nrows <- as.integer(width / 3)
head_tab <- dplyr::compute(head(x, nrows))
# Take the schema from this Table because if x is arrow_dplyr_query, some
# output types could be a best guess (in implicit_schema()).
schema <- head_tab$schema
# Assemble the column names and types
# We use the Arrow type names here. See type_sum.DataType() below.
var_types <- map_chr(schema$fields, ~ format(pillar::new_pillar_type(.$type)))
# glimpse.tbl() left-aligns the var names (pads with whitespace to the right)
# and appends the types next to them. Because those type names are
# aggressively truncated to all be roughly the same length, this means the
# data glimpse that follows is also mostly aligned.
# However, Arrow type names are longer and variable length, and we're only
# truncating the nested type information inside of <...>. So, to keep the
# data glimpses aligned, we "justify" align the name and type: add the padding
# whitespace between them so that the total width is equal.
var_headings <- paste("$", center_pad(tickify(names(x)), var_types))
# Assemble the data glimpse
df <- as.data.frame(head_tab)
formatted_data <- map_chr(df, function(.) {
tryCatch(
paste(pillar::format_glimpse(.), collapse = ", "),
# This could error e.g. if you have a VctrsExtensionType and the package
# that defines methods for the data is not loaded
error = function(e) conditionMessage(e)
)
})
# Here and elsewhere in the glimpse code, you have to use pillar::get_extent()
# instead of nchar() because get_extent knows how to deal with ANSI escapes
# etc.--it counts how much space on the terminal will be taken when printed.
data_width <- width - pillar::get_extent(var_headings)
truncated_data <- make_shorter(formatted_data, data_width)
# Print the table body (var name, type, data glimpse)
cli::cat_line(var_headings, " ", truncated_data)
# TODO: use crayon to style these footers?
if (inherits(x, "arrow_dplyr_query")) {
cli::cat_line("Call `print()` for query details")
} else if (any(grepl("<...>", var_types, fixed = TRUE)) || schema$HasMetadata) {
cli::cat_line("Call `print()` for full schema details")
}
invisible(x)
}
# Dataset has an efficient head() method via Scanner so this is fine
glimpse.Dataset <- glimpse.ArrowTabular
glimpse.arrow_dplyr_query <- function(x,
width = getOption("pillar.width", getOption("width")),
...) {
if (any(map_lgl(all_sources(x), ~ inherits(., "RecordBatchReader")))) {
msg <- paste(
"Cannot glimpse() data from a RecordBatchReader because it can only be",
"read one time. Call `compute()` to evaluate the query first."
)
message(msg)
print(x)
} else if (query_on_dataset(x) && !query_can_stream(x)) {
msg <- paste(
"This query requires a full table scan, so glimpse() may be",
"expensive. Call `compute()` to evaluate the query first."
)
message(msg)
print(x)
} else {
# Go for it
glimpse.ArrowTabular(x, width = width, ...)
}
}
glimpse.RecordBatchReader <- function(x,
width = getOption("pillar.width", getOption("width")),
...) {
# TODO(ARROW-17038): to_arrow() on duckdb con should hold con not RBR so it
# can be run more than once (like duckdb does on the other side)
msg <- paste(
"Cannot glimpse() data from a RecordBatchReader because it can only be",
"read one time; call `as_arrow_table()` to consume it first."
)
message(msg)
print(x)
}
glimpse.ArrowDatum <- function(x, width, ...) {
cli::cat_line(gsub("[ \n]+", " ", x$ToString()))
invisible(x)
}
type_sum.DataType <- function(x) {
if (inherits(x, "VctrsExtensionType")) {
# ptype() holds a vctrs type object, which pillar knows how to format
paste0("ext<", pillar::type_sum(x$ptype()), ">")
} else {
# Trim long type names with <...>
sub("<.*>", "<...>", x$ToString())
}
}
center_pad <- function(left, right) {
left_sizes <- pillar::get_extent(left)
right_sizes <- pillar::get_extent(right)
total_width <- max(left_sizes + right_sizes) + 1L
paste0(left, strrep(" ", total_width - left_sizes - right_sizes), right)
}