# blob: 8380fa2af989cf591b4bca207b11bbd8a3a362da [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#' Install or upgrade the Arrow library
#'
#' Use this function to install the latest release of `arrow`, to switch to or
#' from a nightly development version, or on Linux to try reinstalling with
#' all necessary C++ dependencies.
#'
#' Note that, unlike packages like `tensorflow`, `blogdown`, and others that
#' require external dependencies, you do not need to run `install_arrow()`
#' after a successful `arrow` installation.
#'
#' @param nightly logical: Should we install a development version of the
#' package, or should we install from CRAN (the default)?
#' @param binary On Linux, value to set for the environment variable
#' `LIBARROW_BINARY`, which governs how C++ binaries are used, if at all.
#' The default value, `TRUE`, tells the installation script to detect the
#' Linux distribution and version and find an appropriate C++ library. `FALSE`
#' would tell the script not to retrieve a binary and instead build Arrow C++
#' from source. Other valid values are strings corresponding to a Linux
#' distribution-version, to override the value that would be detected. See the
#' \href{https://arrow.apache.org/docs/r/articles/install.html}{install guide}
#' for further details.
#' @param use_system logical: Should we use `pkg-config` to look for Arrow
#' system packages? Default is `FALSE`. If `TRUE`, source installation may be
#' faster, but there is a risk of version mismatch. This sets the
#' `ARROW_USE_PKG_CONFIG` environment variable.
#' @param minimal logical: If building from source, should we build without
#' optional dependencies (compression libraries, for example)? Default is
#' `FALSE`. This sets the `LIBARROW_MINIMAL` environment variable.
#' @param verbose logical: Print more debugging output when installing? Default
#' is `FALSE`. This sets the `ARROW_R_DEV` environment variable.
#' @param repos character vector of base URLs of the repositories to install
#' from (passed to `install.packages()`)
#' @param ... Additional arguments passed to `install.packages()`
#' @export
#' @importFrom utils install.packages
#' @seealso [arrow_info()] to see if the package was configured with
#' necessary C++ dependencies.
#' \href{https://arrow.apache.org/docs/r/articles/install.html}{install guide}
#' for more ways to tune installation on Linux.
install_arrow <- function(nightly = FALSE,
                          binary = Sys.getenv("LIBARROW_BINARY", TRUE),
                          use_system = Sys.getenv("ARROW_USE_PKG_CONFIG", FALSE),
                          minimal = Sys.getenv("LIBARROW_MINIMAL", FALSE),
                          verbose = Sys.getenv("ARROW_R_DEV", FALSE),
                          repos = getOption("repos"),
                          ...) {
  sysname <- tolower(Sys.info()[["sysname"]])
  conda <- isTRUE(grepl("conda", R.Version()$platform))

  if (conda) {
    # R built by conda: install the conda package rather than the CRAN one
    if (nightly) {
      system("conda install -y -c arrow-nightlies -c conda-forge --strict-channel-priority r-arrow")
    } else {
      system("conda install -y -c conda-forge --strict-channel-priority r-arrow")
    }
  } else {
    # Export the arguments as the environment variables named in the
    # parameter documentation
    Sys.setenv(
      LIBARROW_BINARY = binary,
      LIBARROW_MINIMAL = minimal,
      ARROW_R_DEV = verbose,
      ARROW_USE_PKG_CONFIG = use_system
    )
    # On the M1, we can't use the usual autobrew, which pulls Intel dependencies
    apple_m1 <- grepl("arm-apple|aarch64.*darwin", R.Version()$platform)
    # On Rosetta, we have to build without JEMALLOC, so we also can't autobrew.
    # The sysctl key does not exist on every Mac; silence stderr and the
    # non-zero-exit warning so the probe stays quiet there.
    rosetta <- identical(sysname, "darwin") &&
      identical(
        suppressWarnings(
          system(
            "sysctl -n sysctl.proc_translated",
            intern = TRUE,
            ignore.stderr = TRUE
          )
        ),
        "1"
      )
    if (rosetta) {
      Sys.setenv(ARROW_JEMALLOC = "OFF")
    }
    if (apple_m1 || rosetta) {
      Sys.setenv(FORCE_BUNDLED_BUILD = "true")
    }

    # Sys.getenv() coerces its `unset` fallback to character, so the default
    # value of `binary` is the string "TRUE", not the logical TRUE. Accept
    # both spellings; isTRUE() alone never matches the default.
    binary_requested <- isTRUE(binary) ||
      identical(tolower(as.character(binary)), "true")

    opts <- list()
    if (apple_m1 || rosetta) {
      # Skip binaries (esp. for rosetta)
      opts$pkgType <- "source"
    } else if (binary_requested) {
      # Unless otherwise directed, don't consider newer source packages when
      # options(pkgType) == "both" (default on win/mac)
      opts$install.packages.check.source <- "no"
      opts$install.packages.compile.from.source <- "never"
    }
    if (length(opts) > 0) {
      # Apply the options only for the duration of this call
      old <- options(opts)
      on.exit(options(old), add = TRUE)
    }
    install.packages("arrow", repos = arrow_repos(repos, nightly), ...)
  }

  if ("arrow" %in% loadedNamespaces()) {
    # If you've just sourced this file, "arrow" won't be (re)loaded
    reload_arrow()
  }
}
# Build the vector of repository URLs to install arrow from.
#
# repos:   candidate repository URLs; an empty value or the unset CRAN
#          placeholder is replaced by the cloud.r-project.org mirror.
# nightly: when true, the development repository (option "arrow.dev_repo")
#          is placed ahead of the other repositories.
#
# Returns a character vector of repository URLs.
arrow_repos <- function(repos = getOption("repos"), nightly = FALSE) {
  cran_placeholder <- c(CRAN = "@CRAN@")
  if (length(repos) == 0 || identical(repos, cran_placeholder)) {
    # Nothing usable configured: fall back to the default/CDN
    repos <- "https://cloud.r-project.org/"
  }

  nightly_repo <- getOption(
    "arrow.dev_repo",
    "https://nightlies.apache.org/arrow/r"
  )

  # Always strip the development repo first, so that nightly = FALSE can
  # never pull from it by accident
  release_repos <- setdiff(repos, nightly_repo)

  # For a nightly install the development repo goes in front
  if (nightly) c(nightly_repo, release_repos) else release_repos
}
# Unload and reload the arrow package in the current session.
#
# Needs the optional 'pkgload' package to unload arrow; without it the user
# is only told to restart R. If arrow was attached to the search path it is
# attached again, otherwise only its namespace is loaded.
reload_arrow <- function() {
  have_pkgload <- requireNamespace("pkgload", quietly = TRUE)
  if (!have_pkgload) {
    message("Please restart R to use the 'arrow' package.")
  } else {
    # Record the attachment state before unloading, since unload() clears it
    was_attached <- is.element("package:arrow", search())
    pkgload::unload("arrow")
    if (was_attached) {
      require("arrow", character.only = TRUE, quietly = TRUE)
    } else {
      requireNamespace("arrow", quietly = TRUE)
    }
  }
}
#' Create a source bundle that includes all thirdparty dependencies
#'
#' @param dest_file File path for the new tar.gz package. Defaults to
#' `arrow_V.V.V_with_deps.tar.gz` in the current directory (`V.V.V` is the version)
#' @param source_file File path for the input tar.gz package. Defaults to
#' downloading the package from CRAN (or whatever you have set as the first in
#' `getOption("repos")`)
#' @return The full path to `dest_file`, invisibly
#'
#' @details
#' This function is used for setting up an offline build. If it's possible to
#' download at build time, don't use this function. Instead, let `cmake`
#' download the required dependencies for you.
#' These downloaded dependencies are only used in the build if
#' `ARROW_DEPENDENCY_SOURCE` is unset, `BUNDLED`, or `AUTO`.
#' https://arrow.apache.org/docs/developers/cpp/building.html#offline-builds
#'
#' If you're using binary packages you shouldn't need to use this function. You
#' should download the appropriate binary from your package repository, transfer
#' that to the offline computer, and install that. Any OS can create the source
#' bundle, but it cannot be installed on Windows. (Instead, use a standard
#' Windows binary package.)
#'
#' Note if you're using RStudio Package Manager on Linux: If you still want to
#' make a source bundle with this function, make sure to set the first repo in
#' `options("repos")` to be a mirror that contains source packages (that is:
#' something other than the RSPM binary mirror URLs).
#'
#' ## Steps for an offline install with optional dependencies:
#'
#' ### Using a computer with internet access, pre-download the dependencies:
#' * Install the `arrow` package _or_ run
#' `source("https://raw.githubusercontent.com/apache/arrow/main/r/R/install-arrow.R")`
#' * Run `create_package_with_all_dependencies("my_arrow_pkg.tar.gz")`
#' * Copy the newly created `my_arrow_pkg.tar.gz` to the computer without internet access
#'
#' ### On the computer without internet access, install the prepared package:
#' * Install the `arrow` package from the copied file
#'   * `install.packages("my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo"))`
#'   * This installation will build from source, so `cmake` must be available
#' * Run [arrow_info()] to check installed capabilities
#'
#'
#' @examples
#' \dontrun{
#' new_pkg <- create_package_with_all_dependencies()
#' # Note: this works when run in the same R session, but it's meant to be
#' # copied to a different computer.
#' install.packages(new_pkg, dependencies = c("Depends", "Imports", "LinkingTo"))
#' }
#' @export
create_package_with_all_dependencies <- function(dest_file = NULL, source_file = NULL) {
  # The dependency list is produced by a bash script shipped in the package
  if (Sys.which("bash") == "") {
    stop(
      "\nThis function requires bash to be installed and available in your PATH.\n",
      "If using RTools, it may be useful to run this code as:\n",
      "pkgbuild::with_build_tools(create_package_with_all_dependencies())\n"
    )
  }

  if (is.null(source_file)) {
    # No input given: fetch the arrow source package into a scratch directory
    pkg_download_dir <- tempfile()
    dir.create(pkg_download_dir)
    on.exit(unlink(pkg_download_dir, recursive = TRUE), add = TRUE)
    message("Downloading Arrow source file")
    downloaded <- utils::download.packages("arrow", destdir = pkg_download_dir, type = "source")
    # download.packages() only warns on failure and returns a zero-row matrix;
    # fail here with a clear message instead of a subscript error below
    if (nrow(downloaded) < 1L) {
      stop("Failed to download the arrow source package")
    }
    source_file <- downloaded[1, 2, drop = TRUE]
  }
  if (!file.exists(source_file) || !endsWith(source_file, ".tar.gz")) {
    stop("Arrow package .tar.gz file not found")
  }
  if (is.null(dest_file)) {
    # e.g. convert /path/to/arrow_5.0.0.tar.gz to ./arrow_5.0.0_with_deps.tar.gz
    # (add 'with_deps' for clarity if the file was downloaded locally)
    dest_file <- paste0(
      sub("\\.tar\\.gz$", "", basename(source_file)),
      "_with_deps.tar.gz"
    )
  }

  # Unpack the source package into a scratch directory
  untar_dir <- tempfile()
  on.exit(unlink(untar_dir, recursive = TRUE), add = TRUE)
  utils::untar(source_file, exdir = untar_dir)
  tools_dir <- file.path(untar_dir, "arrow/tools")
  download_dependencies_sh <- file.path(tools_dir, "download_dependencies_R.sh")
  # If you change this path, also need to edit nixlibs.R
  download_dir <- file.path(tools_dir, "thirdparty_dependencies")
  dir.create(download_dir)

  # The shell script writes an R script to stdout; capture it in a temp file
  # and remove that file when we are done
  download_script <- tempfile(fileext = ".R")
  on.exit(unlink(download_script), add = TRUE)
  parse_versions_success <- system2(
    "bash", c(download_dependencies_sh, download_dir),
    stdout = download_script,
    stderr = FALSE
  ) == 0
  if (!parse_versions_success) {
    stop("Failed to parse versions.txt")
  }
  # `source` the download_script to use R to download all the dependency bundles
  source(download_script)

  # Need to change directory to untar_dir so tar() will use relative paths. That
  # means we'll need a full, non-relative path for dest_file. (extra_flags="-C"
  # doesn't work with R's internal tar)
  orig_wd <- getwd()
  on.exit(setwd(orig_wd), add = TRUE)
  # normalizePath() may return the input unchanged if dest_file doesn't exist,
  # so create it first.
  file.create(dest_file)
  dest_file <- normalizePath(dest_file, mustWork = TRUE)
  setwd(untar_dir)
  message("Repacking tar.gz file to ", dest_file)
  tar_successful <- utils::tar(dest_file, compression = "gz") == 0
  if (!tar_successful) {
    stop("Failed to create new tar.gz file")
  }
  invisible(dest_file)
}