// Source blob: 0fc9d30ab6e346dc14d217fdf0ef44556458944f
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! The central type in Apache Arrow is the array: a known-length sequence of values
//! all having the same type. This crate provides concrete implementations of each type, as
//! well as an [`Array`] trait that can be used for type-erasure.
//!
//! # Building an Array
//!
//! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`]
//!
//! ```
//! # use arrow_array::{Int32Array, ListArray, StringArray};
//! # use arrow_array::types::Int32Type;
//! #
//! Int32Array::from(vec![1, 2]);
//! Int32Array::from(vec![Some(1), None]);
//! Int32Array::from_iter([1, 2, 3, 4]);
//! Int32Array::from_iter([Some(1), Some(2), None, Some(4)]);
//!
//! StringArray::from(vec!["foo", "bar"]);
//! StringArray::from(vec![Some("foo"), None]);
//! StringArray::from_iter([Some("foo"), None]);
//! StringArray::from_iter_values(["foo", "bar"]);
//!
//! ListArray::from_iter_primitive::<Int32Type, _, _>([
//! Some(vec![Some(1), None, Some(3)]),
//! None,
//! Some(vec![])
//! ]);
//! ```
//!
//! Additionally [`ArrayBuilder`](builder::ArrayBuilder) implementations can be
//! used to construct arrays with a push-based interface
//!
//! ```
//! # use arrow_array::Int16Array;
//! #
//! // Create a new builder with a capacity of 100
//! let mut builder = Int16Array::builder(100);
//!
//! // Append a single primitive value
//! builder.append_value(1);
//! // Append a null value
//! builder.append_null();
//! // Append a slice of primitive values
//! builder.append_slice(&[2, 3, 4]);
//!
//! // Build the array
//! let array = builder.finish();
//!
//! assert_eq!(5, array.len());
//! assert_eq!(2, array.value(2));
//! assert_eq!(&array.values()[3..5], &[3, 4])
//! ```
//!
//! # Low-level API
//!
//! Internally, arrays consist of one or more shared memory regions backed by a [`Buffer`],
//! the number and meaning of which depend on the array’s data type, as documented in
//! the [Arrow specification].
//!
//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of:
//!
//! * An optional [`NullBuffer`] identifying any null values
//! * A contiguous [`ScalarBuffer<i16>`] of values
//!
//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of:
//!
//! * An optional [`NullBuffer`] identifying any null values
//! * An offsets [`OffsetBuffer<i32>`] identifying valid UTF-8 sequences within the values buffer
//! * A values [`Buffer`] of UTF-8 encoded string data
//!
//! Array constructors such as [`PrimitiveArray::try_new`] provide the ability to cheaply
//! construct an array from these parts, with functions such as [`PrimitiveArray::into_parts`]
//! providing the reverse operation.
//!
//! ```
//! # use arrow_array::{Array, Int32Array, StringArray};
//! # use arrow_buffer::OffsetBuffer;
//! #
//! // Create an Int32Array from Vec without copying
//! let array = Int32Array::new(vec![1, 2, 3].into(), None);
//! assert_eq!(array.values(), &[1, 2, 3]);
//! assert_eq!(array.null_count(), 0);
//!
//! // Create a StringArray from parts
//! let offsets = OffsetBuffer::new(vec![0, 5, 10].into());
//! let array = StringArray::new(offsets, b"helloworld".into(), None);
//! let values: Vec<_> = array.iter().map(|x| x.unwrap()).collect();
//! assert_eq!(values, &["hello", "world"]);
//! ```
//!
//! As [`Buffer`], and its derivatives, can be created from [`Vec`] without copying, this provides
//! an efficient way to not only interoperate with other Rust code, but also implement kernels
//! optimised for the arrow data layout - e.g. by handling buffers instead of values.
//!
//! # Zero-Copy Slicing
//!
//! Given an [`Array`] of arbitrary length, it is possible to create an owned slice of this
//! data. Internally this just increments some ref-counts, and so is incredibly cheap
//!
//! ```rust
//! # use arrow_array::Int32Array;
//! let array = Int32Array::from_iter([1, 2, 3]);
//!
//! // Slice with offset 1 and length 2
//! let sliced = array.slice(1, 2);
//! assert_eq!(sliced.values(), &[2, 3]);
//! ```
//!
//! # Downcasting an Array
//!
//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`].
//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`].
//!
//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc. APIs,
//! it is often the case that you wish to interact with the concrete arrays directly.
//!
//! This requires downcasting to the concrete type of the array:
//!
//! ```
//! # use arrow_array::{Array, Float32Array, Int32Array};
//!
//! // Safely downcast an `Array` to an `Int32Array` and compute the sum
//! // using native i32 values
//! fn sum_int32(array: &dyn Array) -> i32 {
//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap();
//! integers.iter().map(|val| val.unwrap_or_default()).sum()
//! }
//!
//! // Safely downcasts the array to a `Float32Array` and returns a &[f32] view of the data
//! // Note: the values for positions corresponding to nulls will be arbitrary (but still valid f32)
//! fn as_f32_slice(array: &dyn Array) -> &[f32] {
//! array.as_any().downcast_ref::<Float32Array>().unwrap().values()
//! }
//! ```
//!
//! The [`cast::AsArray`] extension trait can make this more ergonomic
//!
//! ```
//! # use arrow_array::Array;
//! # use arrow_array::cast::{AsArray, as_primitive_array};
//! # use arrow_array::types::Float32Type;
//!
//! fn as_f32_slice(array: &dyn Array) -> &[f32] {
//! array.as_primitive::<Float32Type>().values()
//! }
//! ```
//! # Alternatives to ChunkedArray Support
//!
//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python
//! and C++ Arrow implementations. The recommended alternative is to use one of the following:
//! - `Vec<ArrayRef>` a simple, eager version of a `ChunkedArray`
//! - `impl Iterator<Item=ArrayRef>` a lazy version of a `ChunkedArray`
//! - `impl Stream<Item=ArrayRef>` a lazy async version of a `ChunkedArray`
//!
//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes
//! extensive use of [RecordBatchStream].
//!
//! This approach integrates well into the Rust ecosystem, simplifies the implementation and
//! encourages the use of performant lazy and async patterns.
//! ```rust
//! use std::sync::Arc;
//! use arrow_array::{ArrayRef, Float32Array, RecordBatch, StringArray};
//! use arrow_array::cast::AsArray;
//! use arrow_array::types::Float32Type;
//! use arrow_schema::DataType;
//!
//! let batches = [
//! RecordBatch::try_from_iter(vec![
//! ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef),
//! ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef),
//! ]).unwrap(),
//! RecordBatch::try_from_iter(vec![
//! ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef),
//! ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef),
//! ]).unwrap(),
//! ];
//!
//! let labels: Vec<&str> = batches
//! .iter()
//! .flat_map(|batch| batch.column(0).as_string::<i32>())
//! .map(Option::unwrap)
//! .collect();
//!
//! let values: Vec<f32> = batches
//! .iter()
//! .flat_map(|batch| batch.column(1).as_primitive::<Float32Type>().values())
//! .copied()
//! .collect();
//!
//! assert_eq!(labels, ["A", "B", "C", "D", "E"]);
//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]);
//! ```
//! [`ScalarBuffer<T>`]: arrow_buffer::ScalarBuffer
//! [`ScalarBuffer<i16>`]: arrow_buffer::ScalarBuffer
//! [`OffsetBuffer<i32>`]: arrow_buffer::OffsetBuffer
//! [`NullBuffer`]: arrow_buffer::NullBuffer
//! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html
//! [`&dyn Array`]: Array
//! [`Buffer`]: arrow_buffer::Buffer
//! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
//! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html
//! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html
//! [DataFusion]: https://github.com/apache/arrow-datafusion
//! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html
// Crate-wide lints: broken intra-doc links are denied, and public items
// without documentation produce a warning.
#![deny(rustdoc::broken_intra_doc_links)]
#![warn(missing_docs)]
// Public module; its public items are additionally glob re-exported at the crate root.
pub mod array;
pub use array::*;
// Private module; only the record-batch items listed below are exposed at the crate root.
mod record_batch;
pub use record_batch::{
RecordBatch, RecordBatchIterator, RecordBatchOptions, RecordBatchReader, RecordBatchWriter,
};
// Private module; only `ArrowNativeTypeOp` is re-exported from it.
mod arithmetic;
pub use arithmetic::ArrowNativeTypeOp;
// Private modules whose public items are glob re-exported at the crate root.
mod numeric;
pub use numeric::*;
mod scalar;
pub use scalar::*;
// Public modules referenced by the crate documentation
// (`builder::ArrayBuilder`, `cast::AsArray`).
pub mod builder;
pub mod cast;
// Private module; nothing from it is re-exported in this file.
mod delta;
// The two FFI modules are only compiled when the `ffi` cargo feature is enabled.
#[cfg(feature = "ffi")]
pub mod ffi;
#[cfg(feature = "ffi")]
pub mod ffi_stream;
// Remaining public modules, reachable by path (no crate-root re-export here).
pub mod iterator;
pub mod run_iterator;
pub mod temporal_conversions;
pub mod timezone;
// Private module; nothing from it is re-exported in this file.
mod trusted_len;
pub mod types;
#[cfg(test)]
mod tests {
    use crate::builder::*;

    /// Smoke test: each buffer builder type named below must be reachable
    /// through `crate::builder` and constructible with `new(10)`.
    ///
    /// Nothing is asserted at runtime; the test exists so that removing or
    /// renaming one of these types fails the build.
    #[test]
    fn test_buffer_builder_availability() {
        // Expands to one `let _builder = <Type>::new(10);` statement per
        // listed type, so every binding lives until the end of the test.
        macro_rules! construct_each {
            ($($builder_ty:ident),+ $(,)?) => {
                $( let _builder = $builder_ty::new(10); )+
            };
        }

        construct_each!(
            // Signed and unsigned integers
            Int8BufferBuilder,
            Int16BufferBuilder,
            Int32BufferBuilder,
            Int64BufferBuilder,
            UInt16BufferBuilder,
            UInt32BufferBuilder,
            // Floating point
            Float32BufferBuilder,
            Float64BufferBuilder,
            // Timestamps
            TimestampSecondBufferBuilder,
            TimestampMillisecondBufferBuilder,
            TimestampMicrosecondBufferBuilder,
            TimestampNanosecondBufferBuilder,
            // Dates
            Date32BufferBuilder,
            Date64BufferBuilder,
            // Times of day
            Time32SecondBufferBuilder,
            Time32MillisecondBufferBuilder,
            Time64MicrosecondBufferBuilder,
            Time64NanosecondBufferBuilder,
            // Intervals
            IntervalYearMonthBufferBuilder,
            IntervalDayTimeBufferBuilder,
            IntervalMonthDayNanoBufferBuilder,
            // Durations
            DurationSecondBufferBuilder,
            DurationMillisecondBufferBuilder,
            DurationMicrosecondBufferBuilder,
            DurationNanosecondBufferBuilder,
        );
    }
}