// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Defines the logical data types of Arrow arrays.
//!
//! The most important things you might be looking for are:
//! * [`Schema`](crate::datatypes::Schema) to describe a schema.
//! * [`Field`](crate::datatypes::Field) to describe one field within a schema.
//! * [`DataType`](crate::datatypes::DataType) to describe the type of a field.
use std::sync::Arc;
mod native;
pub use native::*;
mod field;
pub use field::*;
mod schema;
pub use schema::*;
mod numeric;
pub use numeric::*;
mod types;
pub use types::*;
mod datatype;
pub use datatype::*;
/// A reference-counted reference to a [`Schema`](crate::datatypes::Schema).
pub type SchemaRef = Arc<Schema>;
#[cfg(test)]
mod tests {
use super::*;
use crate::error::Result;
use serde_json::Value::{Bool, Number as VNumber};
use serde_json::{Number, Value};
use std::{
collections::{BTreeMap, HashMap},
f32::NAN,
};
#[test]
fn test_list_datatype_equality() {
    // `equals_datatype` is expected to ignore the *name* of a list's child
    // field while still comparing the child's data type and nullability.
    let list_of = |child_name: &str, child_type: DataType, nullable: bool| {
        DataType::List(Box::new(Field::new(child_name, child_type, nullable)))
    };
    let fixed_list_of = |child_name: &str, child_type: DataType, nullable: bool| {
        DataType::FixedSizeList(Box::new(Field::new(child_name, child_type, nullable)), 3)
    };

    let int32_item = list_of("item", DataType::Int32, true);
    let int32_array = list_of("array", DataType::Int32, true);
    let int32_required = list_of("item", DataType::Int32, false);
    let uint32_item = list_of("item", DataType::UInt32, true);

    assert!(int32_item.equals_datatype(&int32_array));
    assert!(!int32_item.equals_datatype(&int32_required));
    assert!(!int32_array.equals_datatype(&int32_required));
    assert!(!int32_item.equals_datatype(&uint32_item));

    // The same rule applies one level down, inside fixed-size lists.
    let fixed_item = fixed_list_of("item", int32_item, false);
    let fixed_array = fixed_list_of("array", int32_array, false);
    let fixed_binary = fixed_list_of("item", DataType::FixedSizeBinary(3), true);

    assert!(fixed_item.equals_datatype(&fixed_array));
    assert!(!fixed_item.equals_datatype(&fixed_binary));
    assert!(!fixed_array.equals_datatype(&fixed_binary));

    // Structs wrapping the lists above.
    let struct_nullable_item = DataType::Struct(vec![Field::new("f1", fixed_item, true)]);
    let struct_nullable_array =
        DataType::Struct(vec![Field::new("f1", fixed_array.clone(), true)]);
    let struct_required_array =
        DataType::Struct(vec![Field::new("f1", fixed_array.clone(), false)]);

    // Three-field struct whose field names and last data type vary.
    let three_fields = |names: [&str; 3], last_type: DataType| {
        DataType::Struct(vec![
            Field::new(names[0], fixed_array.clone(), false),
            Field::new(names[1], fixed_binary.clone(), false),
            Field::new(names[2], last_type, true),
        ])
    };
    let short_names_utf8 = three_fields(["f1", "f2", "f3"], DataType::Utf8);
    let long_names_large_utf8 = three_fields(["ff1", "ff2", "ff3"], DataType::LargeUtf8);
    let long_names_utf8 = three_fields(["ff1", "ff2", "ff3"], DataType::Utf8);

    assert!(struct_nullable_item.equals_datatype(&struct_nullable_array));
    assert!(!struct_nullable_item.equals_datatype(&struct_required_array));
    assert!(!short_names_utf8.equals_datatype(&long_names_large_utf8));
    assert!(short_names_utf8.equals_datatype(&long_names_utf8));
}
#[test]
fn create_struct_type() {
    // Smoke test: constructing a nested struct data type must not panic.
    let address = DataType::Struct(vec![
        Field::new("street", DataType::Utf8, false),
        Field::new("zip", DataType::UInt16, false),
    ]);
    let _person = DataType::Struct(vec![
        Field::new("first_name", DataType::Utf8, false),
        Field::new("last_name", DataType::Utf8, false),
        Field::new("address", address, false),
    ]);
}
#[test]
fn serde_struct_type() {
    // Field carrying a non-empty metadata map: serialized as a JSON object.
    let mut populated = BTreeMap::new();
    populated.insert("k".to_string(), "v".to_string());
    let mut first_name = Field::new("first_name", DataType::Utf8, false);
    first_name.set_metadata(Some(populated));

    // Field carrying an empty metadata map: the key is left out of the output.
    let mut last_name = Field::new("last_name", DataType::Utf8, false);
    last_name.set_metadata(Some(BTreeMap::new()));

    let address = Field::new(
        "address",
        DataType::Struct(vec![
            Field::new("street", DataType::Utf8, false),
            Field::new("zip", DataType::UInt16, false),
        ]),
        false,
    );
    let person = DataType::Struct(vec![first_name, last_name, address]);

    // NOTE: this pins the default (derived) serde format, not the JSON format
    // specified in metadata.md.
    let expected = "{\"Struct\":[\
        {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\
        {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\
        {\"name\":\"address\",\"data_type\":{\"Struct\":\
        [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\
        {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\
        ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}";
    let serialized = serde_json::to_string(&person).unwrap();
    assert_eq!(expected, serialized);

    // Deserializing the output must give back the original data type.
    let round_tripped: DataType = serde_json::from_str(&serialized).unwrap();
    assert_eq!(person, round_tripped);
}
#[test]
fn struct_field_to_json() {
    // JSON that `Field::to_json` should produce for a struct with two children.
    let expected_json = r#"{
"name": "address",
"nullable": false,
"type": {
"name": "struct"
},
"children": [
{
"name": "street",
"nullable": false,
"type": {
"name": "utf8"
},
"children": []
},
{
"name": "zip",
"nullable": false,
"type": {
"name": "int",
"bitWidth": 16,
"isSigned": false
},
"children": []
}
]
}"#;
    let expected: Value = serde_json::from_str(expected_json).unwrap();

    let children = vec![
        Field::new("street", DataType::Utf8, false),
        Field::new("zip", DataType::UInt16, false),
    ];
    let address = Field::new("address", DataType::Struct(children), false);

    assert_eq!(expected, address.to_json());
}
#[test]
fn primitive_field_to_json() {
    // JSON that `Field::to_json` should produce for a plain UTF-8 field.
    let expected_json = r#"{
"name": "first_name",
"nullable": false,
"type": {
"name": "utf8"
},
"children": []
}"#;
    let expected: Value = serde_json::from_str(expected_json).unwrap();

    let first_name = Field::new("first_name", DataType::Utf8, false);

    assert_eq!(expected, first_name.to_json());
}
#[test]
fn parse_struct_from_json() {
    // The field we expect to get back from the JSON document below.
    let children = vec![
        Field::new("street", DataType::Utf8, false),
        Field::new("zip", DataType::UInt16, false),
    ];
    let expected = Field::new("address", DataType::Struct(children), false);

    let document: Value = serde_json::from_str(
        r#"
{
"name": "address",
"type": {
"name": "struct"
},
"nullable": false,
"children": [
{
"name": "street",
"type": {
"name": "utf8"
},
"nullable": false,
"children": []
},
{
"name": "zip",
"type": {
"name": "int",
"isSigned": false,
"bitWidth": 16
},
"nullable": false,
"children": []
}
]
}
"#,
    )
    .unwrap();
    let parsed = Field::from(&document).unwrap();

    assert_eq!(expected, parsed);
}
#[test]
fn parse_utf8_from_json() {
    // A bare `utf8` type object parses to `DataType::Utf8`.
    let document: Value = serde_json::from_str(r#"{"name":"utf8"}"#).unwrap();
    let parsed = DataType::from(&document).unwrap();
    assert_eq!(DataType::Utf8, parsed);
}
#[test]
fn parse_int32_from_json() {
    // A signed 32-bit `int` type object parses to `DataType::Int32`.
    let document: Value =
        serde_json::from_str(r#"{"name": "int", "isSigned": true, "bitWidth": 32}"#).unwrap();
    let parsed = DataType::from(&document).unwrap();
    assert_eq!(DataType::Int32, parsed);
}
#[test]
fn schema_json() {
    // Round-trips a schema covering many data types through `Schema::to_json`
    // and `Schema::from`, then checks that schema-level metadata may be empty
    // or absent in the JSON.

    // Add some custom metadata
    let metadata: HashMap<String, String> =
        [("Key".to_string(), "Value".to_string())]
            .iter()
            .cloned()
            .collect();
    let schema = Schema::new_with_metadata(
        vec![
            Field::new("c1", DataType::Utf8, false),
            Field::new("c2", DataType::Binary, false),
            Field::new("c3", DataType::FixedSizeBinary(3), false),
            Field::new("c4", DataType::Boolean, false),
            Field::new("c5", DataType::Date32, false),
            Field::new("c6", DataType::Date64, false),
            Field::new("c7", DataType::Time32(TimeUnit::Second), false),
            Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false),
            Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false),
            Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false),
            Field::new("c11", DataType::Time64(TimeUnit::Second), false),
            Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false),
            Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false),
            Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false),
            Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false),
            Field::new(
                "c16",
                DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())),
                false,
            ),
            Field::new(
                "c17",
                DataType::Timestamp(
                    TimeUnit::Microsecond,
                    Some("Africa/Johannesburg".to_string()),
                ),
                false,
            ),
            Field::new(
                "c18",
                DataType::Timestamp(TimeUnit::Nanosecond, None),
                false,
            ),
            Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
            Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
            Field::new(
                "c21",
                DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
                false,
            ),
            Field::new(
                "c22",
                DataType::FixedSizeList(
                    Box::new(Field::new("bools", DataType::Boolean, false)),
                    5,
                ),
                false,
            ),
            Field::new(
                "c23",
                DataType::List(Box::new(Field::new(
                    "inner_list",
                    DataType::List(Box::new(Field::new(
                        "struct",
                        DataType::Struct(vec![]),
                        true,
                    ))),
                    false,
                ))),
                true,
            ),
            Field::new(
                "c24",
                DataType::Struct(vec![
                    Field::new("a", DataType::Utf8, false),
                    Field::new("b", DataType::UInt16, false),
                ]),
                false,
            ),
            Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true),
            Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true),
            Field::new("c27", DataType::Duration(TimeUnit::Second), false),
            Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false),
            Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false),
            Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false),
            Field::new_dict(
                "c31",
                DataType::Dictionary(
                    Box::new(DataType::Int32),
                    Box::new(DataType::Utf8),
                ),
                true,
                123,
                true,
            ),
            Field::new("c32", DataType::LargeBinary, true),
            Field::new("c33", DataType::LargeUtf8, true),
            Field::new(
                "c34",
                DataType::LargeList(Box::new(Field::new(
                    "inner_large_list",
                    DataType::LargeList(Box::new(Field::new(
                        "struct",
                        DataType::Struct(vec![]),
                        false,
                    ))),
                    true,
                ))),
                true,
            ),
        ],
        metadata,
    );
    // JSON actually produced by the schema under test.
    let serialized = schema.to_json();
    let json = r#"{
"fields": [
{
"name": "c1",
"nullable": false,
"type": {
"name": "utf8"
},
"children": []
},
{
"name": "c2",
"nullable": false,
"type": {
"name": "binary"
},
"children": []
},
{
"name": "c3",
"nullable": false,
"type": {
"name": "fixedsizebinary",
"byteWidth": 3
},
"children": []
},
{
"name": "c4",
"nullable": false,
"type": {
"name": "bool"
},
"children": []
},
{
"name": "c5",
"nullable": false,
"type": {
"name": "date",
"unit": "DAY"
},
"children": []
},
{
"name": "c6",
"nullable": false,
"type": {
"name": "date",
"unit": "MILLISECOND"
},
"children": []
},
{
"name": "c7",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 32,
"unit": "SECOND"
},
"children": []
},
{
"name": "c8",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 32,
"unit": "MILLISECOND"
},
"children": []
},
{
"name": "c9",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 32,
"unit": "MICROSECOND"
},
"children": []
},
{
"name": "c10",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 32,
"unit": "NANOSECOND"
},
"children": []
},
{
"name": "c11",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 64,
"unit": "SECOND"
},
"children": []
},
{
"name": "c12",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 64,
"unit": "MILLISECOND"
},
"children": []
},
{
"name": "c13",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 64,
"unit": "MICROSECOND"
},
"children": []
},
{
"name": "c14",
"nullable": false,
"type": {
"name": "time",
"bitWidth": 64,
"unit": "NANOSECOND"
},
"children": []
},
{
"name": "c15",
"nullable": false,
"type": {
"name": "timestamp",
"unit": "SECOND"
},
"children": []
},
{
"name": "c16",
"nullable": false,
"type": {
"name": "timestamp",
"unit": "MILLISECOND",
"timezone": "UTC"
},
"children": []
},
{
"name": "c17",
"nullable": false,
"type": {
"name": "timestamp",
"unit": "MICROSECOND",
"timezone": "Africa/Johannesburg"
},
"children": []
},
{
"name": "c18",
"nullable": false,
"type": {
"name": "timestamp",
"unit": "NANOSECOND"
},
"children": []
},
{
"name": "c19",
"nullable": false,
"type": {
"name": "interval",
"unit": "DAY_TIME"
},
"children": []
},
{
"name": "c20",
"nullable": false,
"type": {
"name": "interval",
"unit": "YEAR_MONTH"
},
"children": []
},
{
"name": "c21",
"nullable": false,
"type": {
"name": "list"
},
"children": [
{
"name": "item",
"nullable": true,
"type": {
"name": "bool"
},
"children": []
}
]
},
{
"name": "c22",
"nullable": false,
"type": {
"name": "fixedsizelist",
"listSize": 5
},
"children": [
{
"name": "bools",
"nullable": false,
"type": {
"name": "bool"
},
"children": []
}
]
},
{
"name": "c23",
"nullable": true,
"type": {
"name": "list"
},
"children": [
{
"name": "inner_list",
"nullable": false,
"type": {
"name": "list"
},
"children": [
{
"name": "struct",
"nullable": true,
"type": {
"name": "struct"
},
"children": []
}
]
}
]
},
{
"name": "c24",
"nullable": false,
"type": {
"name": "struct"
},
"children": [
{
"name": "a",
"nullable": false,
"type": {
"name": "utf8"
},
"children": []
},
{
"name": "b",
"nullable": false,
"type": {
"name": "int",
"bitWidth": 16,
"isSigned": false
},
"children": []
}
]
},
{
"name": "c25",
"nullable": true,
"type": {
"name": "interval",
"unit": "YEAR_MONTH"
},
"children": []
},
{
"name": "c26",
"nullable": true,
"type": {
"name": "interval",
"unit": "DAY_TIME"
},
"children": []
},
{
"name": "c27",
"nullable": false,
"type": {
"name": "duration",
"unit": "SECOND"
},
"children": []
},
{
"name": "c28",
"nullable": false,
"type": {
"name": "duration",
"unit": "MILLISECOND"
},
"children": []
},
{
"name": "c29",
"nullable": false,
"type": {
"name": "duration",
"unit": "MICROSECOND"
},
"children": []
},
{
"name": "c30",
"nullable": false,
"type": {
"name": "duration",
"unit": "NANOSECOND"
},
"children": []
},
{
"name": "c31",
"nullable": true,
"children": [],
"type": {
"name": "utf8"
},
"dictionary": {
"id": 123,
"indexType": {
"name": "int",
"bitWidth": 32,
"isSigned": true
},
"isOrdered": true
}
},
{
"name": "c32",
"nullable": true,
"type": {
"name": "largebinary"
},
"children": []
},
{
"name": "c33",
"nullable": true,
"type": {
"name": "largeutf8"
},
"children": []
},
{
"name": "c34",
"nullable": true,
"type": {
"name": "largelist"
},
"children": [
{
"name": "inner_large_list",
"nullable": true,
"type": {
"name": "largelist"
},
"children": [
{
"name": "struct",
"nullable": false,
"type": {
"name": "struct"
},
"children": []
}
]
}
]
}
],
"metadata" : {
"Key": "Value"
}
}"#;
    // `json` is already a `&str`, so it is passed straight to the parser; the
    // parsed document is reused for both comparison directions.
    let expected: Value = serde_json::from_str(json).unwrap();
    assert_eq!(expected, serialized);
    // convert back to a schema
    let schema2 = Schema::from(&expected).unwrap();
    assert_eq!(schema, schema2);
    // Check that empty metadata produces empty value in JSON and can be parsed
    let json = r#"{
"fields": [
{
"name": "c1",
"nullable": false,
"type": {
"name": "utf8"
},
"children": []
}
],
"metadata": {}
}"#;
    let value: Value = serde_json::from_str(json).unwrap();
    let schema = Schema::from(&value).unwrap();
    assert!(schema.metadata.is_empty());
    // Check that metadata field is not required in the JSON.
    let json = r#"{
"fields": [
{
"name": "c1",
"nullable": false,
"type": {
"name": "utf8"
},
"children": []
}
]
}"#;
    let value: Value = serde_json::from_str(json).unwrap();
    let schema = Schema::from(&value).unwrap();
    assert!(schema.metadata.is_empty());
}
#[test]
fn create_schema_string() {
    // Expected string rendering of the shared person schema: its fields,
    // separated by ", ".
    let expected = "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: Some({\"k\": \"v\"}) }, \
        Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \
        Field { name: \"address\", data_type: Struct([\
        Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \
        Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }\
        ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \
        Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: None }";
    let rendered = person_schema().to_string();
    assert_eq!(rendered, expected);
}
#[test]
fn schema_field_accessors() {
    // Exercises the read accessors of `Schema` and `Field` against the shared
    // person schema.
    let schema = person_schema();
    // test schema accessors
    assert_eq!(schema.fields().len(), 4);

    // test field accessors on a plain (non-dictionary) field
    let first_name = &schema.fields()[0];
    assert_eq!(first_name.name(), "first_name");
    assert_eq!(first_name.data_type(), &DataType::Utf8);
    assert!(!first_name.is_nullable());
    assert_eq!(first_name.dict_id(), None);
    assert_eq!(first_name.dict_is_ordered(), None);

    // `first_name` carries exactly one metadata entry, "k" -> "v".
    let metadata = first_name.metadata();
    let md = metadata
        .as_ref()
        .expect("first_name should carry metadata");
    assert_eq!(md.len(), 1);
    assert_eq!(md.get("k").map(String::as_str), Some("v"));

    // test field accessors on a dictionary-encoded field
    let interests = &schema.fields()[3];
    assert_eq!(interests.name(), "interests");
    assert_eq!(
        interests.data_type(),
        &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
    );
    assert_eq!(interests.dict_id(), Some(123));
    assert_eq!(interests.dict_is_ordered(), Some(true));
}
#[test]
#[should_panic(
    expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]"
)]
fn schema_index_of() {
    let schema = person_schema();
    // Known field names resolve to their position in the schema.
    for (name, position) in [("first_name", 0), ("last_name", 1)].iter() {
        assert_eq!(schema.index_of(name).unwrap(), *position);
    }
    // Unwrapping the lookup of an unknown name must panic with the message
    // pinned by the `should_panic` attribute.
    schema.index_of("nickname").unwrap();
}
#[test]
#[should_panic(
    expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]"
)]
fn schema_field_with_name() {
    let schema = person_schema();
    // Looking a field up by name returns the field bearing that name.
    for name in ["first_name", "last_name"].iter() {
        let found = schema.field_with_name(name).unwrap();
        assert_eq!(found.name(), *name);
    }
    // Unwrapping the lookup of an unknown name must panic with the message
    // pinned by the `should_panic` attribute.
    schema.field_with_name("nickname").unwrap();
}
#[test]
fn schema_field_with_dict_id() {
    let schema = person_schema();

    // Only `interests` was created with dictionary id 123.
    let matching = schema.fields_with_dict_id(123);
    let matching_names: Vec<_> = matching.iter().map(|field| field.name()).collect();
    assert_eq!(matching_names, vec!["interests"]);

    // No field uses dictionary id 456.
    let unmatched = schema.fields_with_dict_id(456);
    assert!(unmatched.is_empty());
}
#[test]
fn schema_equality() {
    // Builds the three-column baseline schema used on both sides of the
    // equality check.
    let baseline = || {
        Schema::new(vec![
            Field::new("c1", DataType::Utf8, false),
            Field::new("c2", DataType::Float64, true),
            Field::new("c3", DataType::LargeBinary, true),
        ])
    };
    let schema1 = baseline();
    let schema2 = baseline();
    assert_eq!(schema1, schema2);

    // Fewer columns and a different type for `c2`.
    let schema3 = Schema::new(vec![
        Field::new("c1", DataType::Utf8, false),
        Field::new("c2", DataType::Float32, true),
    ]);
    // Fewer columns and upper-case names.
    let schema4 = Schema::new(vec![
        Field::new("C1", DataType::Utf8, false),
        Field::new("C2", DataType::Float64, true),
    ]);
    assert_ne!(schema1, schema3);
    assert_ne!(schema1, schema4);
    assert_ne!(schema2, schema3);
    assert_ne!(schema2, schema4);
    assert_ne!(schema3, schema4);

    // Same columns as the baseline, but `c1` additionally carries metadata.
    let mut c1_metadata = BTreeMap::new();
    c1_metadata.insert("foo".to_string(), "bar".to_string());
    let mut c1_with_metadata = Field::new("c1", DataType::Utf8, false);
    c1_with_metadata.set_metadata(Some(c1_metadata));
    let schema5 = Schema::new(vec![
        c1_with_metadata,
        Field::new("c2", DataType::Float64, true),
        Field::new("c3", DataType::LargeBinary, true),
    ]);
    assert_ne!(schema1, schema5);
}
#[test]
fn test_arrow_native_type_to_json() {
    assert_eq!(Some(Bool(true)), true.into_json_value());

    // Each tested integer type converts the value 1 to the same JSON number.
    let json_one = Some(VNumber(Number::from(1)));
    assert_eq!(json_one, 1i8.into_json_value());
    assert_eq!(json_one, 1i16.into_json_value());
    assert_eq!(json_one, 1i32.into_json_value());
    assert_eq!(json_one, 1i64.into_json_value());
    assert_eq!(json_one, 1u8.into_json_value());
    assert_eq!(json_one, 1u16.into_json_value());
    assert_eq!(json_one, 1u32.into_json_value());
    assert_eq!(json_one, 1u64.into_json_value());

    // Finite floats convert to a JSON number, with and without a type suffix.
    let json_hundredth = Some(VNumber(Number::from_f64(0.01f64).unwrap()));
    assert_eq!(json_hundredth, 0.01.into_json_value());
    assert_eq!(json_hundredth, 0.01f64.into_json_value());

    // NaN yields no JSON value.
    assert_eq!(None, NAN.into_json_value());
}
/// Builds the four-field schema (`first_name`, `last_name`, `address`,
/// `interests`) shared by several tests in this module.
fn person_schema() -> Schema {
    // `first_name` is the only field that carries metadata ("k" -> "v").
    let mut first_name_metadata = BTreeMap::new();
    first_name_metadata.insert("k".to_string(), "v".to_string());
    let mut first_name = Field::new("first_name", DataType::Utf8, false);
    first_name.set_metadata(Some(first_name_metadata));

    let last_name = Field::new("last_name", DataType::Utf8, false);

    let address = Field::new(
        "address",
        DataType::Struct(vec![
            Field::new("street", DataType::Utf8, false),
            Field::new("zip", DataType::UInt16, false),
        ]),
        false,
    );

    // Dictionary field created with dictionary id 123 and the ordered flag set.
    let interests = Field::new_dict(
        "interests",
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
        true,
        123,
        true,
    );

    Schema::new(vec![first_name, last_name, address, interests])
}
#[test]
fn test_try_merge_field_with_metadata() {
    /// Builds a field-metadata map from string pairs.
    fn metadata_of(pairs: &[(&str, &str)]) -> BTreeMap<String, String> {
        let mut map = BTreeMap::new();
        for (key, value) in pairs.iter() {
            map.insert(key.to_string(), value.to_string());
        }
        map
    }

    /// Builds the `first_name` field used on both sides of every merge below.
    fn first_name_field(metadata: Option<BTreeMap<String, String>>) -> Field {
        let mut field = Field::new("first_name", DataType::Utf8, false);
        if let Some(map) = metadata {
            field.set_metadata(Some(map));
        }
        field
    }

    // 1. Different values for the same key should cause error.
    let left = first_name_field(Some(metadata_of(&[("foo", "bar")])));
    let right = first_name_field(Some(metadata_of(&[("foo", "baz")])));
    let conflicting = Schema::try_merge(vec![
        Schema::new(vec![left]),
        Schema::new(vec![right]),
    ]);
    assert!(conflicting.is_err());

    // 2. None + Some: the target takes over the source's metadata.
    let mut target = first_name_field(None);
    let source = first_name_field(Some(metadata_of(&[("missing", "value")])));
    assert!(target.try_merge(&source).is_ok());
    assert!(target.metadata().is_some());
    assert_eq!(
        target.metadata().as_ref().unwrap(),
        source.metadata().as_ref().unwrap()
    );

    // 3. Some + Some: disjoint keys are combined.
    let mut target = first_name_field(Some(metadata_of(&[("foo", "bar")])));
    let source = first_name_field(Some(metadata_of(&[("foo2", "bar2")])));
    assert!(target.try_merge(&source).is_ok());
    assert!(target.metadata().is_some());
    assert_eq!(
        target.metadata().clone().unwrap(),
        metadata_of(&[("foo", "bar"), ("foo2", "bar2")])
    );

    // 4. Some + None: the target keeps its own metadata.
    let mut target = first_name_field(Some(metadata_of(&[("foo", "bar")])));
    let source = first_name_field(None);
    assert!(target.try_merge(&source).is_ok());
    assert!(target.metadata().is_some());
    assert_eq!(
        target.metadata().clone().unwrap(),
        metadata_of(&[("foo", "bar")])
    );

    // 5. None + None: the result has no metadata.
    let mut target = first_name_field(None);
    let source = first_name_field(None);
    assert!(target.try_merge(&source).is_ok());
    assert!(target.metadata().is_none());
}
#[test]
fn test_schema_merge() -> Result<()> {
    /// Builds a one-entry schema-metadata map.
    fn single_entry(key: &str, value: &str) -> HashMap<String, String> {
        let mut map = HashMap::new();
        map.insert(key.to_string(), value.to_string());
        map
    }

    // --- Compatible schemas merge field by field ---------------------------
    let base = Schema::new(vec![
        Field::new("first_name", DataType::Utf8, false),
        Field::new("last_name", DataType::Utf8, false),
        Field::new(
            "address",
            DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)]),
            false,
        ),
    ]);
    let extension = Schema::new_with_metadata(
        vec![
            // nullable merge
            Field::new("last_name", DataType::Utf8, true),
            Field::new(
                "address",
                DataType::Struct(vec![
                    // add new nested field
                    Field::new("street", DataType::Utf8, false),
                    // nullable merge on nested field
                    Field::new("zip", DataType::UInt16, true),
                ]),
                false,
            ),
            // new field
            Field::new("number", DataType::Utf8, true),
        ],
        single_entry("foo", "bar"),
    );
    let merged = Schema::try_merge(vec![base, extension])?;

    let expected = Schema::new_with_metadata(
        vec![
            Field::new("first_name", DataType::Utf8, false),
            Field::new("last_name", DataType::Utf8, true),
            Field::new(
                "address",
                DataType::Struct(vec![
                    Field::new("zip", DataType::UInt16, true),
                    Field::new("street", DataType::Utf8, false),
                ]),
                false,
            ),
            Field::new("number", DataType::Utf8, true),
        ],
        single_entry("foo", "bar"),
    );
    assert_eq!(merged, expected);

    // --- support merge union fields ----------------------------------------
    let union_schema = |members: Vec<Field>| {
        Schema::new(vec![Field::new("c1", DataType::Union(members), false)])
    };
    let merged_union = Schema::try_merge(vec![
        union_schema(vec![
            Field::new("c11", DataType::Utf8, true),
            Field::new("c12", DataType::Utf8, true),
        ]),
        union_schema(vec![
            Field::new("c12", DataType::Utf8, true),
            Field::new("c13", DataType::Time64(TimeUnit::Second), true),
        ]),
    ])?;
    let expected_union = union_schema(vec![
        Field::new("c11", DataType::Utf8, true),
        Field::new("c12", DataType::Utf8, true),
        Field::new("c13", DataType::Time64(TimeUnit::Second), true),
    ]);
    assert_eq!(merged_union, expected_union);

    // --- incompatible field should throw error -----------------------------
    let type_clash = Schema::try_merge(vec![
        Schema::new(vec![
            Field::new("first_name", DataType::Utf8, false),
            Field::new("last_name", DataType::Utf8, false),
        ]),
        Schema::new(vec![Field::new("last_name", DataType::Int64, false)]),
    ]);
    assert!(type_clash.is_err());

    // --- incompatible metadata should throw error --------------------------
    let metadata_clash = Schema::try_merge(vec![
        Schema::new_with_metadata(
            vec![Field::new("first_name", DataType::Utf8, false)],
            single_entry("foo", "bar"),
        ),
        Schema::new_with_metadata(
            vec![Field::new("last_name", DataType::Utf8, false)],
            single_entry("foo", "baz"),
        ),
    ]);
    assert!(metadata_clash.is_err());

    Ok(())
}
}