// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "arrow/testing/util.h"

#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>

#ifdef _WIN32
// clang-format off
// (prevent include reordering)
#include "arrow/util/windows_compatibility.h"
#include <winsock2.h>
// clang-format on
#else
#include <arpa/inet.h>   // IWYU pragma: keep
#include <netinet/in.h>  // IWYU pragma: keep
#include <sys/socket.h>  // IWYU pragma: keep
#include <sys/stat.h>    // IWYU pragma: keep
#include <sys/types.h>   // IWYU pragma: keep
#include <sys/wait.h>    // IWYU pragma: keep
#include <unistd.h>      // IWYU pragma: keep
#endif

#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/testing/random.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
namespace arrow {
// Produce a seed value taken from the wall clock.
//
// The result is the tick count of high_resolution_clock's time since its
// epoch, so it is not reproducible from one run to the next.
uint64_t random_seed() {
  const auto since_epoch = std::chrono::high_resolution_clock::now().time_since_epoch();
  return since_epoch.count();
}
// Fill null_bytes[0 .. n) with pseudo-random 0/1 flags.
//
// A byte is set to 1 when a uniform draw from [0, 1) is strictly greater than
// `pct_null`, and to 0 otherwise. The engine is always seeded with 0, so every
// call produces the same sequence of draws.
void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) {
  const int fixed_seed = 0;
  std::default_random_engine engine(fixed_seed);
  std::uniform_real_distribution<double> unit_draw(0.0, 1.0);

  uint8_t* const end = null_bytes + n;
  for (uint8_t* cursor = null_bytes; cursor != end; ++cursor) {
    *cursor = unit_draw(engine) > pct_null;
  }
}
// Resize *is_valid to n entries and overwrite every entry with a pseudo-random
// flag.
//
// An entry becomes true when a uniform draw from [0, 1) is strictly greater
// than `pct_null`. The draws come from an engine seeded with `random_seed`, so
// equal seeds give equal results.
void random_is_valid(int64_t n, double pct_null, std::vector<bool>* is_valid,
                     int random_seed) {
  std::default_random_engine engine(random_seed);
  std::uniform_real_distribution<double> unit_draw(0.0, 1.0);

  is_valid->resize(n, false);
  for (auto slot = is_valid->begin(); slot != is_valid->end(); ++slot) {
    *slot = unit_draw(engine) > pct_null;
  }
}
// Fill out[0 .. n) with pseudo-random bytes.
//
// The bytes come from an engine seeded with `seed`, so equal seeds give equal
// output. `out` must point to at least n writable bytes.
void random_bytes(int64_t n, uint32_t seed, uint8_t* out) {
  std::default_random_engine gen(seed);
  // Draw 32-bit values limited to [0, 255] and narrow them: the standard does
  // not allow uniform_int_distribution to be instantiated with 8-bit types.
  std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max());
  std::generate(out, out + n, [&d, &gen] { return static_cast<uint8_t>(d(gen)); });
}

// Return a string holding n pseudo-random bytes generated from `seed`.
//
// The bytes are exactly those random_bytes(n, seed, ...) produces. A
// non-positive n yields an empty string.
std::string random_string(int64_t n, uint32_t seed) {
  std::string s;
  if (n <= 0) {
    return s;
  }
  // The string must own n bytes before random_bytes writes through &s[0];
  // writing into a default-constructed (empty) string is out of bounds.
  s.resize(static_cast<size_t>(n));
  random_bytes(n, seed, reinterpret_cast<uint8_t*>(&s[0]));
  return s;
}
// Write n pseudo-random decimal values into `out`, each occupying 16 bytes.
//
// For every value, the first DecimalType::DecimalSize(precision) bytes are
// random; the remaining bytes of the 16-byte slot are 0x00, or 0xFF when the
// top bit of the last random byte is set (sign extension).
// `out` must point to at least 16 * n writable bytes.
void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
  std::default_random_engine engine(seed);
  std::uniform_int_distribution<uint32_t> byte_dist(0,
                                                    std::numeric_limits<uint8_t>::max());
  const int32_t required_bytes = DecimalType::DecimalSize(precision);
  constexpr int32_t byte_width = 16;
  auto next_byte = [&byte_dist, &engine] {
    return static_cast<uint8_t>(byte_dist(engine));
  };

  // Start from all-zero slots so that non-negative values need no extra padding.
  std::fill(out, out + byte_width * n, '\0');

  uint8_t* slot = out;
  for (int64_t i = 0; i < n; ++i) {
    std::generate(slot, slot + required_bytes, next_byte);
    // sign extend if the sign bit (0b10000000 == 0x80) is set for the last
    // byte generated
    const bool sign_bit_set = (slot[required_bytes - 1] & 0x80) != 0;
    if (sign_bit_set) {
      std::fill(slot + required_bytes, slot + byte_width, '\xFF');
    }
    slot += byte_width;
  }
}
// Fill `out` with n pseudo-random values by delegating to rand_uniform_int
// with 'A' and 'z' as the bounds.
// NOTE(review): whether the bounds are inclusive depends on rand_uniform_int,
// which is defined outside this file.
void random_ascii(int64_t n, uint32_t seed, uint8_t* out) {
  const auto lowest = static_cast<int32_t>('A');
  const auto highest = static_cast<int32_t>('z');
  rand_uniform_int(n, seed, lowest, highest, out);
}
// Count the entries of `valid_bytes` that are zero.
int64_t CountNulls(const std::vector<uint8_t>& valid_bytes) {
  int64_t null_count = 0;
  for (const uint8_t flag : valid_bytes) {
    if (flag == 0) {
      ++null_count;
    }
  }
  return null_count;
}
// Allocate a resizable buffer of `length` bytes from `pool`, fill it with
// pseudo-random bytes derived from `seed`, and hand it back through `out`.
//
// If the allocation fails, ARROW_ASSIGN_OR_RAISE returns that error from this
// function; otherwise Status::OK() is returned.
Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool,
                            std::shared_ptr<ResizableBuffer>* out, uint32_t seed) {
  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(length, pool));
  uint8_t* const bytes = buffer->mutable_data();
  random_bytes(length, seed, bytes);
  *out = std::move(buffer);
  return Status::OK();
}
// Store the value of the ARROW_TEST_DATA environment variable in *out.
//
// Returns Status::OK() when the variable is set, and an IOError explaining how
// to set it when it is not (in which case *out is left untouched).
Status GetTestResourceRoot(std::string* out) {
  const char* env_value = std::getenv("ARROW_TEST_DATA");
  if (env_value != nullptr) {
    *out = std::string(env_value);
    return Status::OK();
  }
  return Status::IOError(
      "Test resources not found, set ARROW_TEST_DATA to <repo root>/testing/data");
}
int GetListenPort() {
// Get a new available port number by binding a socket to an ephemeral port
// and then closing it. Since ephemeral port allocation tends to avoid
// reusing port numbers, this should give a different port number
// every time, even across processes.
struct sockaddr_in sin;
#ifdef _WIN32
SOCKET sock_fd;
auto sin_len = static_cast<int>(sizeof(sin));
auto errno_message = []() -> std::string {
return internal::WinErrorMessage(WSAGetLastError());
#define SOCKET_ERROR -1
int sock_fd;
auto sin_len = static_cast<socklen_t>(sizeof(sin));
auto errno_message = []() -> std::string { return internal::ErrnoMessage(errno); };
#ifdef _WIN32
WSADATA wsa_data;
if (WSAStartup(0x0202, &wsa_data) != 0) {
ARROW_LOG(FATAL) << "Failed to initialize Windows Sockets";
sock_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
if (sock_fd == INVALID_SOCKET) {
Status::IOError("Failed to create TCP socket: ", errno_message()).Abort();
// First bind to ('', 0)
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
if (bind(sock_fd, reinterpret_cast<struct sockaddr*>(&sin), sin_len) == SOCKET_ERROR) {
Status::IOError("bind() failed: ", errno_message()).Abort();
// Then get actual bound port number
if (getsockname(sock_fd, reinterpret_cast<struct sockaddr*>(&sin), &sin_len) ==
Status::IOError("getsockname() failed: ", errno_message()).Abort();
int port = ntohs(sin.sin_port);
#ifdef _WIN32
return port;
const std::vector<std::shared_ptr<DataType>>& all_dictionary_index_types() {
static std::vector<std::shared_ptr<DataType>> types = {
int8(), uint8(), int16(), uint16(), int32(), uint32(), int64(), uint64()};
return types;
} // namespace arrow