blob: 3d2b2a43dc1198dcdf609a9d78d408ca648e0244 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package schema_test
import (
"log"
"os"
"reflect"
"testing"
"github.com/apache/arrow/go/v6/parquet"
"github.com/apache/arrow/go/v6/parquet/schema"
"github.com/stretchr/testify/assert"
)
// ExampleNewSchemaFromStruct_primitives derives a schema from a struct whose
// fields are untagged Go primitive types and prints the resulting node tree.
//
// The expected output below shows one required column per field, named after
// the Go field.
func ExampleNewSchemaFromStruct_primitives() {
	type Schema struct {
		Bool              bool
		Int8              int8
		Uint16            uint16
		Int32             int32
		Int64             int64
		Int96             parquet.Int96
		Float             float32
		Double            float64
		ByteArray         string
		FixedLenByteArray [10]byte
	}

	sc, err := schema.NewSchemaFromStruct(Schema{})
	if err != nil {
		log.Fatal(err)
	}

	// NOTE(review): the trailing 2 is presumably the indent width per nesting
	// level. The expected output is compared line by line, so the nested
	// columns must carry that indentation.
	schema.PrintSchema(sc.Root(), os.Stdout, 2)

	// Output:
	// repeated group field_id=-1 Schema {
	//   required boolean field_id=-1 Bool;
	//   required int32 field_id=-1 Int8 (Int(bitWidth=8, isSigned=true));
	//   required int32 field_id=-1 Uint16 (Int(bitWidth=16, isSigned=false));
	//   required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
	//   required int64 field_id=-1 Int64 (Int(bitWidth=64, isSigned=true));
	//   required int96 field_id=-1 Int96;
	//   required float field_id=-1 Float;
	//   required double field_id=-1 Double;
	//   required byte_array field_id=-1 ByteArray;
	//   required fixed_len_byte_array field_id=-1 FixedLenByteArray;
	// }
}
// ExampleNewSchemaFromStruct_convertedtypes derives a schema from a struct
// whose `parquet` tags use the converted= key (plus name=, scale= and
// precision=) and prints the resulting node tree.
//
// The tag values are written in mixed case (UTF8, date, TIME_MILLIS,
// time_micros, ...) and a pointer to the struct is passed rather than a value.
func ExampleNewSchemaFromStruct_convertedtypes() {
	type ConvertedSchema struct {
		Utf8           string        `parquet:"name=utf8, converted=UTF8"`
		Uint32         uint32        `parquet:"converted=INT_32"`
		Date           int32         `parquet:"name=date, converted=date"`
		TimeMilli      int32         `parquet:"name=timemilli, converted=TIME_MILLIS"`
		TimeMicro      int64         `parquet:"name=timemicro, converted=time_micros"`
		TimeStampMilli int64         `parquet:"converted=timestamp_millis"`
		TimeStampMicro int64         `parquet:"converted=timestamp_micros"`
		Interval       parquet.Int96 `parquet:"converted=INTERVAL"`
		Decimal1       int32         `parquet:"converted=decimal, scale=2, precision=9"`
		Decimal2       int64         `parquet:"converted=decimal, scale=2, precision=18"`
		Decimal3       [12]byte      `parquet:"converted=decimal, scale=2, precision=10"`
		Decimal4       string        `parquet:"converted=decimal, scale=2, precision=20"`
	}

	sc, err := schema.NewSchemaFromStruct(&ConvertedSchema{})
	if err != nil {
		log.Fatal(err)
	}

	// NOTE(review): the trailing 2 is presumably the indent width per nesting
	// level. The expected output is compared line by line, so the nested
	// columns must carry that indentation.
	schema.PrintSchema(sc.Root(), os.Stdout, 2)

	// Output:
	// repeated group field_id=-1 ConvertedSchema {
	//   required byte_array field_id=-1 utf8 (String);
	//   required int32 field_id=-1 Uint32 (Int(bitWidth=32, isSigned=true));
	//   required int32 field_id=-1 date (Date);
	//   required int32 field_id=-1 timemilli (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
	//   required int64 field_id=-1 timemicro (Time(isAdjustedToUTC=true, timeUnit=microseconds));
	//   required int64 field_id=-1 TimeStampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=true, force_set_converted_type=false));
	//   required int64 field_id=-1 TimeStampMicro (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false));
	//   required int96 field_id=-1 Interval;
	//   required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
	//   required int64 field_id=-1 Decimal2 (Decimal(precision=18, scale=2));
	//   required fixed_len_byte_array field_id=-1 Decimal3 (Decimal(precision=10, scale=2));
	//   required byte_array field_id=-1 Decimal4 (Decimal(precision=20, scale=2));
	// }
}
// ExampleNewSchemaFromStruct_repetition derives a schema from a struct that
// mixes slices, pointers and plain values with explicit repetition= and
// fieldid= tags, then prints the resulting node tree.
//
// Per the expected output: an untagged slice becomes a List group, a slice
// tagged repetition=repeated becomes a bare repeated column, an untagged
// pointer becomes optional, and an explicit repetition tag overrides what the
// Go type alone would give (Required is a pointer, Opt is a value).
func ExampleNewSchemaFromStruct_repetition() {
	type RepetitionSchema struct {
		List     []int64 `parquet:"fieldid=1"`
		Repeated []int64 `parquet:"repetition=repeated, fieldid=2"`
		Optional *int64  `parquet:"fieldid=3"`
		Required *int64  `parquet:"repetition=REQUIRED, fieldid=4"`
		Opt      int64   `parquet:"repetition=OPTIONAL, fieldid=5"`
	}

	sc, err := schema.NewSchemaFromStruct(RepetitionSchema{})
	if err != nil {
		log.Fatal(err)
	}

	// NOTE(review): the trailing 2 is presumably the indent width per nesting
	// level. The expected output is compared line by line, so every nested
	// level must carry that indentation.
	schema.PrintSchema(sc.Root(), os.Stdout, 2)

	// Output:
	// repeated group field_id=-1 RepetitionSchema {
	//   required group field_id=1 List (List) {
	//     repeated group field_id=-1 list {
	//       required int64 field_id=-1 element (Int(bitWidth=64, isSigned=true));
	//     }
	//   }
	//   repeated int64 field_id=2 Repeated (Int(bitWidth=64, isSigned=true));
	//   optional int64 field_id=3 Optional (Int(bitWidth=64, isSigned=true));
	//   required int64 field_id=4 Required (Int(bitWidth=64, isSigned=true));
	//   optional int64 field_id=5 Opt (Int(bitWidth=64, isSigned=true));
	// }
}
// ExampleNewSchemaFromStruct_logicaltypes derives a schema from a struct whose
// `parquet` tags use the logical= key and its logical.* sub-keys, then prints
// the resulting node tree.
//
// Per the expected output: when both precision/scale and
// logical.precision/logical.scale are given (Decimal3) the logical.* values
// are the ones printed, and time/timestamp fields that omit
// logical.isadjustedutc are printed with isAdjustedToUTC=true.
func ExampleNewSchemaFromStruct_logicaltypes() {
	type LogicalTypes struct {
		String                []byte   `parquet:"logical=String"`
		Enum                  string   `parquet:"logical=enum"`
		Date                  int32    `parquet:"logical=date"`
		Decimal1              int32    `parquet:"logical=decimal, precision=9, scale=2"`
		Decimal2              int32    `parquet:"logical=decimal, logical.precision=9, scale=2"`
		Decimal3              int32    `parquet:"logical=decimal, precision=5, logical.precision=9, scale=1, logical.scale=3"`
		TimeMilliUTC          int32    `parquet:"logical=TIME, logical.unit=millis"`
		TimeMilli             int32    `parquet:"logical=Time, logical.unit=millis, logical.isadjustedutc=false"`
		TimeMicros            int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=false"`
		TimeMicrosUTC         int64    `parquet:"logical=time, logical.unit=micros, logical.isadjustedutc=true"`
		TimeNanos             int64    `parquet:"logical=time, logical.unit=nanos"`
		TimestampMilli        int64    `parquet:"logical=timestamp, logical.unit=millis"`
		TimestampMicrosNotUTC int64    `parquet:"logical=timestamp, logical.unit=micros, logical.isadjustedutc=false"`
		TimestampNanos        int64    `parquet:"logical=timestamp, logical.unit=nanos"`
		JSON                  string   `parquet:"logical=json"`
		BSON                  []byte   `parquet:"logical=BSON"`
		UUID                  [16]byte `parquet:"logical=uuid"`
	}

	sc, err := schema.NewSchemaFromStruct(LogicalTypes{})
	if err != nil {
		log.Fatal(err)
	}

	// NOTE(review): the trailing 2 is presumably the indent width per nesting
	// level. The expected output is compared line by line, so the nested
	// columns must carry that indentation.
	schema.PrintSchema(sc.Root(), os.Stdout, 2)

	// Output:
	// repeated group field_id=-1 LogicalTypes {
	//   required byte_array field_id=-1 String (String);
	//   required byte_array field_id=-1 Enum (Enum);
	//   required int32 field_id=-1 Date (Date);
	//   required int32 field_id=-1 Decimal1 (Decimal(precision=9, scale=2));
	//   required int32 field_id=-1 Decimal2 (Decimal(precision=9, scale=2));
	//   required int32 field_id=-1 Decimal3 (Decimal(precision=9, scale=3));
	//   required int32 field_id=-1 TimeMilliUTC (Time(isAdjustedToUTC=true, timeUnit=milliseconds));
	//   required int32 field_id=-1 TimeMilli (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
	//   required int64 field_id=-1 TimeMicros (Time(isAdjustedToUTC=false, timeUnit=microseconds));
	//   required int64 field_id=-1 TimeMicrosUTC (Time(isAdjustedToUTC=true, timeUnit=microseconds));
	//   required int64 field_id=-1 TimeNanos (Time(isAdjustedToUTC=true, timeUnit=nanoseconds));
	//   required int64 field_id=-1 TimestampMilli (Timestamp(isAdjustedToUTC=true, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false));
	//   required int64 field_id=-1 TimestampMicrosNotUTC (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
	//   required int64 field_id=-1 TimestampNanos (Timestamp(isAdjustedToUTC=true, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false));
	//   required byte_array field_id=-1 JSON (JSON);
	//   required byte_array field_id=-1 BSON (BSON);
	//   required fixed_len_byte_array field_id=-1 UUID (UUID);
	// }
}
// ExampleNewSchemaFromStruct_physicaltype derives a schema from a struct whose
// `parquet` tags use type= (and length=) to pick a physical type that differs
// from the one the Go field type shows, then prints the resulting node tree.
//
// Per the expected output: int64 and int fields tagged type=int32 are printed
// as int32 columns, and string / []byte fields tagged
// type=fixed_len_byte_array are printed as fixed_len_byte_array columns.
func ExampleNewSchemaFromStruct_physicaltype() {
	type ChangeTypes struct {
		Int32        int64  `parquet:"type=int32"`
		FixedLen     string `parquet:"type=fixed_len_byte_array, length=10"`
		SliceAsFixed []byte `parquet:"type=fixed_len_byte_array, length=12"`
		Int          int    `parquet:"type=int32"`
	}

	sc, err := schema.NewSchemaFromStruct(ChangeTypes{})
	if err != nil {
		log.Fatal(err)
	}

	// NOTE(review): the trailing 2 is presumably the indent width per nesting
	// level. The expected output is compared line by line, so the nested
	// columns must carry that indentation.
	schema.PrintSchema(sc.Root(), os.Stdout, 2)

	// Output:
	// repeated group field_id=-1 ChangeTypes {
	//   required int32 field_id=-1 Int32 (Int(bitWidth=32, isSigned=true));
	//   required fixed_len_byte_array field_id=-1 FixedLen;
	//   required fixed_len_byte_array field_id=-1 SliceAsFixed;
	//   required int32 field_id=-1 Int (Int(bitWidth=32, isSigned=true));
	// }
}
// ExampleNewSchemaFromStruct_nestedtypes derives a schema from a struct that
// contains maps, slices, arrays and nested structs, then prints the resulting
// node tree.
//
// The tags use key*/value* prefixed keys (keytype, keylogical, valueconverted,
// valuerepetition, keyfieldid, ...). Per the expected output: maps become Map
// groups wrapping a repeated key_value group, untagged slices and the array
// tagged logical=list become List groups wrapping a repeated list group, and
// a slice tagged repetition=repeated becomes a bare repeated group.
func ExampleNewSchemaFromStruct_nestedtypes() {
	type Other struct {
		OptionalMap *map[string]*string `parquet:"valuerepetition=required, keylogical=String, valueconverted=BSON"`
	}

	type MyMap map[int32]string

	type Nested struct {
		SimpleMap     map[int32]string
		FixedLenMap   map[string][]byte `parquet:"keytype=fixed_len_byte_array, keyfieldid=10, valuefieldid=11, keylength=10"`
		DecimalMap    map[int32]string  `parquet:"logical=map, keyconverted=DECIMAL, keyscale=3, keyprecision=7, valuetype=fixed_len_byte_array, valuelength=4, valuelogical=decimal, valuelogical.precision=9, valuescale=2"`
		OtherList     []*Other
		OtherRepeated []Other `parquet:"repetition=repeated"`
		DateArray     [5]int32 `parquet:"valuelogical=date, logical=list"`
		DateMap       MyMap    `parquet:"keylogical=TIME, keylogical.unit=MILLIS, keylogical.isadjustedutc=false, valuelogical=enum"`
	}

	sc, err := schema.NewSchemaFromStruct(Nested{})
	if err != nil {
		log.Fatal(err)
	}

	// NOTE(review): the trailing 2 is presumably the indent width per nesting
	// level. The expected output is compared line by line, so every nested
	// level must carry that indentation.
	schema.PrintSchema(sc.Root(), os.Stdout, 2)

	// Output:
	// repeated group field_id=-1 Nested {
	//   required group field_id=-1 SimpleMap (Map) {
	//     repeated group field_id=-1 key_value {
	//       required int32 field_id=-1 key (Int(bitWidth=32, isSigned=true));
	//       required byte_array field_id=-1 value;
	//     }
	//   }
	//   required group field_id=-1 FixedLenMap (Map) {
	//     repeated group field_id=-1 key_value {
	//       required fixed_len_byte_array field_id=10 key;
	//       required byte_array field_id=11 value;
	//     }
	//   }
	//   required group field_id=-1 DecimalMap (Map) {
	//     repeated group field_id=-1 key_value {
	//       required int32 field_id=-1 key (Decimal(precision=7, scale=3));
	//       required fixed_len_byte_array field_id=-1 value (Decimal(precision=9, scale=2));
	//     }
	//   }
	//   required group field_id=-1 OtherList (List) {
	//     repeated group field_id=-1 list {
	//       optional group field_id=-1 element {
	//         optional group field_id=-1 OptionalMap (Map) {
	//           repeated group field_id=-1 key_value {
	//             required byte_array field_id=-1 key (String);
	//             required byte_array field_id=-1 value (BSON);
	//           }
	//         }
	//       }
	//     }
	//   }
	//   repeated group field_id=-1 OtherRepeated {
	//     optional group field_id=-1 OptionalMap (Map) {
	//       repeated group field_id=-1 key_value {
	//         required byte_array field_id=-1 key (String);
	//         required byte_array field_id=-1 value (BSON);
	//       }
	//     }
	//   }
	//   required group field_id=-1 DateArray (List) {
	//     repeated group field_id=-1 list {
	//       required int32 field_id=-1 element (Date);
	//     }
	//   }
	//   required group field_id=-1 DateMap (Map) {
	//     repeated group field_id=-1 key_value {
	//       required int32 field_id=-1 key (Time(isAdjustedToUTC=false, timeUnit=milliseconds));
	//       required byte_array field_id=-1 value (Enum);
	//     }
	//   }
	// }
}
// TestStructFromSchema builds a flat group of primitive columns covering the
// required, optional and repeated repetitions, and checks the Go struct type
// that NewStructFromSchema derives from it.
//
// Per the expected type string: the optional column maps to a pointer, the
// repeated column to a slice, and the required columns to plain values.
func TestStructFromSchema(t *testing.T) {
	root, err := schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{
		schema.NewBooleanNode("bool", parquet.Repetitions.Required, -1),
		schema.NewInt32Node("int32", parquet.Repetitions.Optional, -1),
		schema.NewInt64Node("int64", parquet.Repetitions.Repeated, -1),
		schema.NewInt96Node("int96", parquet.Repetitions.Required, -1),
		schema.NewFloat32Node("float", parquet.Repetitions.Required, -1),
		schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1),
		schema.NewFixedLenByteArrayNode("fixedLen", parquet.Repetitions.Required, 10, -1),
	}, -1)
	// assert.NoError only records the failure; stop here so a possibly nil
	// root is never handed to NewSchema.
	if !assert.NoError(t, err) {
		return
	}

	sc := schema.NewSchema(root)
	typ, err := schema.NewStructFromSchema(sc)
	// typ is not meaningful when err is non-nil; calling methods on it could
	// panic and hide the real failure.
	if !assert.NoError(t, err) {
		return
	}

	assert.Equal(t, reflect.Struct, typ.Kind())
	assert.Equal(t, "struct { bool bool; int32 *int32; int64 []int64; int96 parquet.Int96; float float32; bytearray parquet.ByteArray; fixedLen parquet.FixedLenByteArray }",
		typ.String())
}
// TestStructFromSchemaWithNesting round-trips a struct containing slices,
// a map and nested structs through NewSchemaFromStruct and
// NewStructFromSchema, and checks that the derived struct type has the same
// field names and shapes (with the named nested struct types rendered as
// anonymous structs).
func TestStructFromSchemaWithNesting(t *testing.T) {
	type Other struct {
		List *[]*float32
	}

	type Nested struct {
		Nest         []int32
		OptionalNest []*int64
		Mapped       map[string]float32
		Other        []Other
		Other2       Other
	}

	sc, err := schema.NewSchemaFromStruct(Nested{})
	// assert.NoError only records the failure; stop here so a possibly nil
	// schema is never passed on.
	if !assert.NoError(t, err) {
		return
	}

	typ, err := schema.NewStructFromSchema(sc)
	// typ is not meaningful when err is non-nil; calling String on it could
	// panic and hide the real failure.
	if !assert.NoError(t, err) {
		return
	}

	assert.Equal(t, "struct { Nest []int32; OptionalNest []*int64; Mapped map[string]float32; Other []struct { List *[]*float32 }; Other2 struct { List *[]*float32 } }",
		typ.String())
}
// TestStructFromSchemaBackwardsCompatList checks the struct type that
// NewStructFromSchema derives for several shapes of List-annotated group: the
// three-level form (a repeated "list" group wrapping "element") and a number
// of two-level forms where the repeated child is a primitive or a group named
// "element", "array" or "my_list_tuple".
//
// Each case is wrapped in a root "schema" group and run as a subtest.
func TestStructFromSchemaBackwardsCompatList(t *testing.T) {
	tests := []struct {
		name     string
		n        schema.Node
		expected string
	}{
		{"proper list", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required,
			schema.FieldList{
				schema.MustGroup(schema.NewGroupNode("list", parquet.Repetitions.Repeated, schema.FieldList{schema.NewBooleanNode("element", parquet.Repetitions.Optional, -1)}, -1)),
			}, schema.NewListLogicalType(), -1)), "struct { my_list []*bool }"},
		{"backward nullable list nonnull ints", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
			schema.NewInt32Node("element", parquet.Repetitions.Repeated, -1),
		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]int32 }"},
		{"backward nullable list tuple string int", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
			schema.MustGroup(schema.NewGroupNode("element", parquet.Repetitions.Repeated, schema.FieldList{
				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
				schema.NewInt32Node("num", parquet.Repetitions.Required, -1),
			}, -1)),
		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string; num int32 } }"},
		{"list tuple string", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Required, schema.FieldList{
			schema.MustGroup(schema.NewGroupNode("array", parquet.Repetitions.Repeated, schema.FieldList{
				schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1),
			}, -1)),
		}, schema.NewListLogicalType(), -1)), "struct { my_list []struct { str parquet.ByteArray } }"},
		{"list tuple string my_list_tuple", schema.MustGroup(schema.NewGroupNodeLogical("my_list", parquet.Repetitions.Optional, schema.FieldList{
			schema.MustGroup(schema.NewGroupNode("my_list_tuple", parquet.Repetitions.Repeated, schema.FieldList{
				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("str", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
			}, -1)),
		}, schema.NewListLogicalType(), -1)), "struct { my_list *[]struct { str string } }"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Wrap the node under test in a root group so it forms a full schema.
			root := schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1))

			typ, err := schema.NewStructFromSchema(schema.NewSchema(root))
			// typ is not meaningful when err is non-nil; calling String on it
			// could panic and hide the real failure.
			if !assert.NoError(t, err) {
				return
			}
			assert.Equal(t, tt.expected, typ.String())
		})
	}
}
// TestStructFromSchemaMaps checks the struct type that NewStructFromSchema
// derives for map-annotated groups: a Map group with the usual key_value/
// key/value naming, a Map group whose children use other names
// ("map"/"str"/"num"), and a MapKeyValue converted-type group that has a key
// but no value column (expected to yield map[string]bool).
//
// Each case is wrapped in a root "schema" group and run as a subtest.
func TestStructFromSchemaMaps(t *testing.T) {
	tests := []struct {
		name     string
		n        schema.Node
		expected string
	}{
		{"map string int", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Required, schema.FieldList{
			schema.MustGroup(schema.NewGroupNode("key_value", parquet.Repetitions.Repeated, schema.FieldList{
				schema.MustPrimitive(schema.NewPrimitiveNodeLogical("key", parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.ByteArray, 0, -1)),
				schema.NewInt32Node("value", parquet.Repetitions.Optional, -1),
			}, -1)),
		}, schema.MapLogicalType{}, -1)), "struct { my_map map[string]*int32 }"},
		{"nullable map string, int, required values", schema.MustGroup(schema.NewGroupNodeLogical("my_map", parquet.Repetitions.Optional, schema.FieldList{
			schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{
				schema.NewByteArrayNode("str", parquet.Repetitions.Required, -1),
				schema.NewInt32Node("num", parquet.Repetitions.Required, -1),
			}, -1)),
		}, schema.MapLogicalType{}, -1)), "struct { my_map *map[string]int32 }"},
		{"map_key_value with missing value", schema.MustGroup(schema.NewGroupNodeConverted("my_map", parquet.Repetitions.Optional, schema.FieldList{
			schema.MustGroup(schema.NewGroupNode("map", parquet.Repetitions.Repeated, schema.FieldList{
				schema.NewByteArrayNode("key", parquet.Repetitions.Required, -1),
			}, -1)),
		}, schema.ConvertedTypes.MapKeyValue, -1)), "struct { my_map *map[string]bool }"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Wrap the node under test in a root group so it forms a full schema.
			root := schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Repeated, schema.FieldList{tt.n}, -1))

			typ, err := schema.NewStructFromSchema(schema.NewSchema(root))
			// typ is not meaningful when err is non-nil; calling String on it
			// could panic and hide the real failure.
			if !assert.NoError(t, err) {
				return
			}
			assert.Equal(t, tt.expected, typ.String())
		})
	}
}