blob: 4ef472efb152fea3c520787187f36e1c62f40203 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Data types that connect Parquet physical types with their Rust-specific
//! representations.
use std::mem;
use byteorder::{BigEndian, ByteOrder};
use crate::basic::Type;
use crate::util::memory::{ByteBuffer, ByteBufferPtr};
/// Rust representation for the Parquet physical type INT96; the value is backed by an
/// array of three `u32` words.
///
/// The payload itself is 12 bytes. It is stored as `Option<[u32; 3]>` so that an unset
/// value can be represented, which means `size_of::<Int96>()` is larger than 12 bytes.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct Int96 {
    // `None` until `set_data` (or a `From` conversion) provides the three words.
    value: Option<[u32; 3]>,
}

impl Int96 {
    /// Creates new INT96 type struct with no data set.
    pub fn new() -> Self {
        Self { value: None }
    }

    /// Returns underlying data as slice of [`u32`]. The slice always has length 3.
    ///
    /// # Panics
    ///
    /// Panics if no data has been set on this value.
    pub fn data(&self) -> &[u32] {
        self.value
            .as_ref()
            .expect("set_data should have been called")
    }

    /// Sets data for this INT96 type, replacing any previously stored words.
    pub fn set_data(&mut self, elem0: u32, elem1: u32, elem2: u32) {
        self.value = Some([elem0, elem1, elem2]);
    }
}

impl From<Vec<u32>> for Int96 {
    /// Builds an `Int96` from a vector holding exactly three `u32` words.
    ///
    /// # Panics
    ///
    /// Panics if `buf.len() != 3`.
    fn from(buf: Vec<u32>) -> Self {
        assert_eq!(buf.len(), 3, "Int96 requires exactly 3 u32 words");
        Self {
            value: Some([buf[0], buf[1], buf[2]]),
        }
    }
}
/// Rust representation for BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY Parquet physical types.
/// Value is backed by a byte buffer.
///
/// A value created with `new` has no buffer attached; the accessors that read the
/// buffer (`len`, `is_empty`, `data`, `slice`) panic until one is provided, e.g. through
/// `set_data`.
#[derive(Clone, Debug)]
pub struct ByteArray {
    // `None` until a buffer has been attached.
    data: Option<ByteBufferPtr>,
}

impl ByteArray {
    /// Creates new byte array with no data set.
    pub fn new() -> Self {
        ByteArray { data: None }
    }

    /// Gets length of the underlying byte buffer.
    ///
    /// # Panics
    ///
    /// Panics if no data has been set.
    pub fn len(&self) -> usize {
        self.buffer().len()
    }

    /// Returns `true` if the underlying byte buffer has length zero.
    ///
    /// # Panics
    ///
    /// Panics if no data has been set.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns slice of data.
    ///
    /// # Panics
    ///
    /// Panics if no data has been set.
    pub fn data(&self) -> &[u8] {
        self.buffer().as_ref()
    }

    /// Set data from another byte buffer, replacing any buffer held before.
    pub fn set_data(&mut self, data: ByteBufferPtr) {
        self.data = Some(data);
    }

    /// Returns `ByteArray` instance with slice of values for a data.
    ///
    /// The new instance wraps whatever `ByteBufferPtr::range(start, len)` returns for
    /// the current buffer.
    ///
    /// # Panics
    ///
    /// Panics if no data has been set.
    pub fn slice(&self, start: usize, len: usize) -> Self {
        Self::from(self.buffer().range(start, len))
    }

    /// Borrows the attached buffer, panicking with a descriptive message when unset.
    fn buffer(&self) -> &ByteBufferPtr {
        self.data
            .as_ref()
            .expect("set_data should have been called")
    }
}
impl From<Vec<u8>> for ByteArray {
fn from(buf: Vec<u8>) -> ByteArray {
Self {
data: Some(ByteBufferPtr::new(buf)),
}
}
}
impl<'a> From<&'a str> for ByteArray {
fn from(s: &'a str) -> ByteArray {
let mut v = Vec::new();
v.extend_from_slice(s.as_bytes());
Self {
data: Some(ByteBufferPtr::new(v)),
}
}
}
impl From<ByteBufferPtr> for ByteArray {
fn from(ptr: ByteBufferPtr) -> ByteArray {
Self { data: Some(ptr) }
}
}
impl From<ByteBuffer> for ByteArray {
fn from(mut buf: ByteBuffer) -> ByteArray {
Self {
data: Some(buf.consume()),
}
}
}
impl Default for ByteArray {
fn default() -> Self {
ByteArray { data: None }
}
}
impl PartialEq for ByteArray {
    /// Two byte arrays are equal when both hold buffers with identical bytes, or when
    /// neither has any data set. A set value never equals an unset one.
    fn eq(&self, other: &ByteArray) -> bool {
        if let (Some(lhs), Some(rhs)) = (&self.data, &other.data) {
            return lhs.as_ref() == rhs.as_ref();
        }
        // At least one side is unset; equal only if both are.
        self.data.is_none() && other.data.is_none()
    }
}
/// Rust representation for Decimal values.
///
/// This is not a representation of Parquet physical type, but rather a wrapper for
/// DECIMAL logical type, and serves as container for raw parts of decimal values:
/// unscaled value in bytes, precision and scale.
#[derive(Clone, Debug)]
pub enum Decimal {
    /// Decimal backed by `i32`.
    Int32 {
        /// Unscaled value as 4 bytes; `Decimal::from_i32` writes them in big-endian order.
        value: [u8; 4],
        /// Decimal precision.
        precision: i32,
        /// Decimal scale.
        scale: i32,
    },
    /// Decimal backed by `i64`.
    Int64 {
        /// Unscaled value as 8 bytes; `Decimal::from_i64` writes them in big-endian order.
        value: [u8; 8],
        /// Decimal precision.
        precision: i32,
        /// Decimal scale.
        scale: i32,
    },
    /// Decimal backed by byte array.
    Bytes {
        /// Unscaled value bytes exactly as supplied by the caller.
        // NOTE(review): presumably big-endian like the other variants — confirm.
        value: ByteArray,
        /// Decimal precision.
        precision: i32,
        /// Decimal scale.
        scale: i32,
    },
}
impl Decimal {
    /// Creates new decimal value from `i32`.
    ///
    /// The unscaled value is stored as 4 big-endian bytes.
    pub fn from_i32(value: i32, precision: i32, scale: i32) -> Self {
        let mut encoded = [0u8; 4];
        BigEndian::write_i32(&mut encoded, value);
        Decimal::Int32 {
            value: encoded,
            precision,
            scale,
        }
    }

    /// Creates new decimal value from `i64`.
    ///
    /// The unscaled value is stored as 8 big-endian bytes.
    pub fn from_i64(value: i64, precision: i32, scale: i32) -> Self {
        let mut encoded = [0u8; 8];
        BigEndian::write_i64(&mut encoded, value);
        Decimal::Int64 {
            value: encoded,
            precision,
            scale,
        }
    }

    /// Creates new decimal value from `ByteArray`; the bytes are stored as given.
    pub fn from_bytes(value: ByteArray, precision: i32, scale: i32) -> Self {
        Decimal::Bytes {
            value,
            precision,
            scale,
        }
    }

    /// Returns bytes of unscaled value.
    ///
    /// For the `Bytes` variant this delegates to `ByteArray::data`.
    pub fn data(&self) -> &[u8] {
        match self {
            Decimal::Int32 { value, .. } => &value[..],
            Decimal::Int64 { value, .. } => &value[..],
            Decimal::Bytes { value, .. } => value.data(),
        }
    }

    /// Returns decimal precision.
    pub fn precision(&self) -> i32 {
        match *self {
            Decimal::Int32 { precision, .. }
            | Decimal::Int64 { precision, .. }
            | Decimal::Bytes { precision, .. } => precision,
        }
    }

    /// Returns decimal scale.
    pub fn scale(&self) -> i32 {
        match *self {
            Decimal::Int32 { scale, .. }
            | Decimal::Int64 { scale, .. }
            | Decimal::Bytes { scale, .. } => scale,
        }
    }
}
impl Default for Decimal {
fn default() -> Self {
Self::from_i32(0, 0, 0)
}
}
impl PartialEq for Decimal {
    /// Decimals are equal when precision, scale and the unscaled bytes all match.
    ///
    /// The variant itself is not compared, only the bytes returned by `data()`, so
    /// values backed by different storage can still be equal.
    fn eq(&self, other: &Decimal) -> bool {
        if self.precision() != other.precision() {
            return false;
        }
        if self.scale() != other.scale() {
            return false;
        }
        self.data() == other.data()
    }
}
/// Exposes a value of a data type as a slice of `u8` bytes.
pub trait AsBytes {
    /// Returns a byte-slice view of this value.
    fn as_bytes(&self) -> &[u8];
}

// Implements `AsBytes` for a primitive type by viewing the value's own storage as
// `size_of::<T>()` bytes, i.e. in the byte order of the running target.
macro_rules! gen_as_bytes {
    ($prim:ident) => {
        impl AsBytes for $prim {
            fn as_bytes(&self) -> &[u8] {
                let first_byte = self as *const $prim as *const u8;
                let byte_len = ::std::mem::size_of::<$prim>();
                // SAFETY: `first_byte` comes from a live `&self`, so it is valid for
                // reads of `size_of::<Self>()` bytes for the lifetime of that borrow,
                // and `u8` has an alignment of 1.
                unsafe { ::std::slice::from_raw_parts(first_byte, byte_len) }
            }
        }
    };
}

gen_as_bytes!(bool);
gen_as_bytes!(u8);
gen_as_bytes!(i32);
gen_as_bytes!(u32);
gen_as_bytes!(i64);
gen_as_bytes!(f32);
gen_as_bytes!(f64);
impl AsBytes for Int96 {
    /// Returns the 12 bytes of the three stored `u32` words, in memory order.
    ///
    /// Panics (inside `data()`) if no data has been set.
    fn as_bytes(&self) -> &[u8] {
        let words: &[u32] = self.data();
        let first_byte = words as *const [u32] as *const u8;
        // SAFETY: `words` borrows the three `u32` values stored in `self`, so the
        // pointer is valid for reads of 12 bytes for as long as `self` is borrowed,
        // and `u8` has an alignment of 1.
        unsafe { ::std::slice::from_raw_parts(first_byte, 12) }
    }
}
impl AsBytes for ByteArray {
fn as_bytes(&self) -> &[u8] {
self.data()
}
}
impl AsBytes for Decimal {
fn as_bytes(&self) -> &[u8] {
self.data()
}
}
impl AsBytes for Vec<u8> {
    /// Returns the vector's contents as a slice.
    fn as_bytes(&self) -> &[u8] {
        &self[..]
    }
}
impl<'a> AsBytes for &'a str {
    /// Returns the bytes of the referenced string slice.
    fn as_bytes(&self) -> &[u8] {
        // Bind as a plain `&str` first so the call below resolves to the inherent
        // `str::as_bytes`.
        let text: &str = self;
        text.as_bytes()
    }
}
impl AsBytes for str {
    /// Returns the bytes of this string slice.
    fn as_bytes(&self) -> &[u8] {
        // An explicitly typed `&str` receiver keeps the call on the inherent
        // `str::as_bytes`.
        let text: &str = self;
        text.as_bytes()
    }
}
/// Contains the Parquet physical type information as well as the Rust primitive type
/// representation.
///
/// Both functions are associated functions (no `self`), so they are called on the
/// implementing type itself.
pub trait DataType: 'static {
    /// Native Rust type used to hold values of this Parquet type. It must support
    /// equality comparison, debug formatting, default construction and cloning, and
    /// must be viewable as raw bytes through `AsBytes`.
    type T: ::std::cmp::PartialEq
        + ::std::fmt::Debug
        + ::std::default::Default
        + ::std::clone::Clone
        + AsBytes;

    /// Returns Parquet physical type.
    fn get_physical_type() -> Type;

    /// Returns size in bytes for Rust representation of the physical type.
    fn get_type_size() -> usize;
}
// Declares an empty marker struct named `$marker` and implements `DataType` for it,
// binding the Parquet physical type, the native Rust value type and the size in bytes
// reported by `get_type_size`.
macro_rules! make_type {
    ($marker:ident, $parquet_ty:path, $rust_ty:ty, $byte_width:expr) => {
        pub struct $marker {}

        impl DataType for $marker {
            type T = $rust_ty;

            fn get_physical_type() -> Type {
                $parquet_ty
            }

            fn get_type_size() -> usize {
                $byte_width
            }
        }
    };
}

// Generate struct definitions for all physical types. Every size is the in-memory size
// of the native Rust type.
make_type!(BoolType, Type::BOOLEAN, bool, mem::size_of::<bool>());
make_type!(Int32Type, Type::INT32, i32, mem::size_of::<i32>());
make_type!(Int64Type, Type::INT64, i64, mem::size_of::<i64>());
make_type!(Int96Type, Type::INT96, Int96, mem::size_of::<Int96>());
make_type!(FloatType, Type::FLOAT, f32, mem::size_of::<f32>());
make_type!(DoubleType, Type::DOUBLE, f64, mem::size_of::<f64>());
make_type!(
    ByteArrayType,
    Type::BYTE_ARRAY,
    ByteArray,
    mem::size_of::<ByteArray>()
);
make_type!(
    FixedLenByteArrayType,
    Type::FIXED_LEN_BYTE_ARRAY,
    ByteArray,
    mem::size_of::<ByteArray>()
);
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `AsBytes` for primitives, strings and the wrapper types defined in this
    /// module. Expected primitive byte patterns are written in little-endian order.
    #[test]
    fn test_as_bytes() {
        // Booleans
        assert_eq!(false.as_bytes(), &[0]);
        assert_eq!(true.as_bytes(), &[1]);

        // 32-bit integers
        assert_eq!(7i32.as_bytes(), &[7, 0, 0, 0]);
        assert_eq!(555i32.as_bytes(), &[43, 2, 0, 0]);
        assert_eq!(555u32.as_bytes(), &[43, 2, 0, 0]);
        assert_eq!(i32::max_value().as_bytes(), &[255, 255, 255, 127]);
        assert_eq!(i32::min_value().as_bytes(), &[0, 0, 0, 128]);

        // 64-bit integers
        assert_eq!(7i64.as_bytes(), &[7, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(555i64.as_bytes(), &[43, 2, 0, 0, 0, 0, 0, 0]);
        assert_eq!(
            i64::max_value().as_bytes(),
            &[255, 255, 255, 255, 255, 255, 255, 127]
        );
        assert_eq!(i64::min_value().as_bytes(), &[0, 0, 0, 0, 0, 0, 0, 128]);

        // Floating point
        assert_eq!(3.14f32.as_bytes(), &[195, 245, 72, 64]);
        assert_eq!(3.14f64.as_bytes(), &[31, 133, 235, 81, 184, 30, 9, 64]);

        // Strings and byte vectors
        assert_eq!("hello".as_bytes(), &[b'h', b'e', b'l', b'l', b'o']);
        assert_eq!(
            Vec::from("hello".as_bytes()).as_bytes(),
            &[b'h', b'e', b'l', b'l', b'o']
        );

        // Int96
        let int96 = Int96::from(vec![1, 2, 3]);
        assert_eq!(int96.as_bytes(), &[1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0]);

        // ByteArray
        let byte_array = ByteArray::from(vec![1, 2, 3]);
        assert_eq!(byte_array.as_bytes(), &[1, 2, 3]);

        // Decimal: integer-backed values are stored big-endian
        let from_i32 = Decimal::from_i32(123, 5, 2);
        assert_eq!(from_i32.as_bytes(), &[0, 0, 0, 123]);
        let from_i64 = Decimal::from_i64(123, 5, 2);
        assert_eq!(from_i64.as_bytes(), &[0, 0, 0, 0, 0, 0, 0, 123]);
        let from_bytes = Decimal::from_bytes(ByteArray::from(vec![1, 2, 3]), 5, 2);
        assert_eq!(from_bytes.as_bytes(), &[1, 2, 3]);
    }

    /// `Int96::from(Vec<u32>)` keeps the three words in order.
    #[test]
    fn test_int96_from() {
        let converted = Int96::from(vec![1, 12345, 1234567890]);
        assert_eq!(converted.data(), &[1, 12345, 1234567890]);
    }

    /// Every `From` conversion into `ByteArray` preserves the source bytes.
    #[test]
    fn test_byte_array_from() {
        let from_vec = ByteArray::from(vec![b'A', b'B', b'C']);
        assert_eq!(from_vec.data(), &[b'A', b'B', b'C']);

        let from_str = ByteArray::from("ABC");
        assert_eq!(from_str.data(), &[b'A', b'B', b'C']);

        let from_ptr = ByteArray::from(ByteBufferPtr::new(vec![1u8, 2u8, 3u8, 4u8, 5u8]));
        assert_eq!(from_ptr.data(), &[1u8, 2u8, 3u8, 4u8, 5u8]);

        let mut buf = ByteBuffer::new();
        buf.set_data(vec![6u8, 7u8, 8u8, 9u8, 10u8]);
        let from_buffer = ByteArray::from(buf);
        assert_eq!(from_buffer.data(), &[6u8, 7u8, 8u8, 9u8, 10u8]);
    }

    /// Equality looks at precision, scale and the unscaled bytes, not the variant.
    #[test]
    fn test_decimal_partial_eq() {
        assert_eq!(Decimal::default(), Decimal::from_i32(0, 0, 0));
        assert_eq!(Decimal::from_i32(222, 5, 2), Decimal::from_i32(222, 5, 2));

        // A byte-array-backed decimal equals an i32-backed one with the same bytes.
        assert_eq!(
            Decimal::from_bytes(ByteArray::from(vec![0, 0, 0, 3]), 5, 2),
            Decimal::from_i32(3, 5, 2)
        );

        // Differing value, precision, scale or byte width each break equality.
        assert_ne!(Decimal::from_i32(222, 5, 2), Decimal::from_i32(111, 5, 2));
        assert_ne!(Decimal::from_i32(222, 5, 2), Decimal::from_i32(222, 6, 2));
        assert_ne!(Decimal::from_i32(222, 5, 2), Decimal::from_i32(222, 5, 3));
        assert_ne!(Decimal::from_i64(222, 5, 2), Decimal::from_i32(222, 5, 2));
    }
}