// blob: 92ca00a76469ce0b720ee4d3b5ac08fe73bf6281 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.parquet;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.iceberg.Schema;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.mapping.NameMapping;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Type.Repetition;
import org.apache.parquet.schema.Types.GroupBuilder;
import org.apache.parquet.schema.Types.ListBuilder;
import org.apache.parquet.schema.Types.MapBuilder;
import org.apache.parquet.schema.Types.PrimitiveBuilder;
import org.junit.Assert;
import org.junit.Test;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
/**
 * Tests for {@link ParquetSchemaUtil}: restoring field ids on a Parquet schema through a name mapping, and
 * converting a Parquet schema to an Iceberg schema when only some of its fields carry ids.
 */
public class TestParquetSchemaUtil {

  /** Struct of primitive fields that is reused as the leaf value type of the nested schemas built below. */
  private static final Types.StructType SUPPORTED_PRIMITIVES = Types.StructType.of(
      required(100, "id", Types.LongType.get()),
      optional(101, "data", Types.StringType.get()),
      required(102, "b", Types.BooleanType.get()),
      optional(103, "i", Types.IntegerType.get()),
      required(104, "l", Types.LongType.get()),
      optional(105, "f", Types.FloatType.get()),
      required(106, "d", Types.DoubleType.get()),
      optional(107, "date", Types.DateType.get()),
      required(108, "ts", Types.TimestampType.withZone()),
      required(110, "s", Types.StringType.get()),
      required(112, "fixed", Types.FixedType.ofLength(7)),
      optional(113, "bytes", Types.BinaryType.get()),
      required(114, "dec_9_0", Types.DecimalType.of(9, 0)),
      required(115, "dec_11_2", Types.DecimalType.of(11, 2)),
      required(116, "dec_38_10", Types.DecimalType.of(38, 10)) // Spark's maximum precision
  );

  /**
   * Converts a deeply nested Iceberg schema to Parquet, strips the ids from the Parquet schema, re-applies them
   * through a name mapping created from the Iceberg schema, and expects the original Parquet schema back.
   */
  @Test
  public void testAssignIdsByNameMapping() {
    // The ids written in these definitions are placeholders: fresh ids are assigned before the schema is used.
    Types.ListType listOfMaps = Types.ListType.ofOptional(2,
        Types.MapType.ofOptional(3, 4, Types.StringType.get(), SUPPORTED_PRIMITIVES));
    Types.MapType mapOfLists = Types.MapType.ofOptional(6, 7,
        Types.StringType.get(),
        Types.ListType.ofOptional(8, SUPPORTED_PRIMITIVES));
    Types.ListType listOfLists = Types.ListType.ofOptional(10,
        Types.ListType.ofOptional(11, SUPPORTED_PRIMITIVES));
    Types.MapType mapOfMaps = Types.MapType.ofOptional(13, 14,
        Types.StringType.get(),
        Types.MapType.ofOptional(15, 16, Types.StringType.get(), SUPPORTED_PRIMITIVES));
    Types.StructType structOfNestedTypes = Types.StructType.of(
        required(20, "m1", Types.MapType.ofOptional(21, 22, Types.StringType.get(), SUPPORTED_PRIMITIVES)),
        optional(23, "l1", Types.ListType.ofRequired(24, SUPPORTED_PRIMITIVES)),
        required(25, "l2", Types.ListType.ofRequired(26, SUPPORTED_PRIMITIVES)),
        optional(27, "m2", Types.MapType.ofOptional(28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)));

    Types.StructType structType = Types.StructType.of(
        required(0, "id", Types.LongType.get()),
        optional(1, "list_of_maps", listOfMaps),
        optional(5, "map_of_lists", mapOfLists),
        required(9, "list_of_lists", listOfLists),
        required(12, "map_of_maps", mapOfMaps),
        required(17, "list_of_struct_of_nested_types", Types.ListType.ofOptional(19, structOfNestedTypes)));

    // Renumber every field from a counter starting at 0 (first assigned id is 1).
    AtomicInteger lastAssignedId = new AtomicInteger(0);
    Types.StructType structWithFreshIds =
        TypeUtil.assignFreshIds(structType, lastAssignedId::incrementAndGet).asStructType();
    Schema schema = new Schema(structWithFreshIds.fields());
    NameMapping nameMapping = MappingUtil.create(schema);

    MessageType messageTypeWithIds = ParquetSchemaUtil.convert(schema, "parquet_type");
    MessageType messageTypeWithoutIds = RemoveIds.removeIds(messageTypeWithIds);
    MessageType messageTypeWithIdsFromNameMapping =
        ParquetSchemaUtil.applyNameMapping(messageTypeWithoutIds, nameMapping);

    Assert.assertEquals(messageTypeWithIds, messageTypeWithIdsFromNameMapping);
  }

  /**
   * Runs {@code ParquetSchemaUtil.convertAndPrune} on a Parquet schema in which only some fields carry ids.
   *
   * <p>The expected Iceberg schema contains only {@code int_col}, {@code double_col}, {@code struct_col_1}
   * (without {@code n2}), {@code list_col_5}, {@code map_col_4} (value struct reduced to {@code n2}) and
   * {@code map_col_5}; every other column of the input is expected to be absent.
   */
  @Test
  public void testSchemaConversionWithoutAssigningIds() {
    // Top-level primitives; "long_col" has no id.
    Type intCol = primitive(1, "int_col", PrimitiveTypeName.INT32, Repetition.REQUIRED);
    Type doubleCol = primitive(2, "double_col", PrimitiveTypeName.DOUBLE, Repetition.OPTIONAL);
    Type longCol = primitive(null, "long_col", PrimitiveTypeName.INT64, Repetition.OPTIONAL);

    // Structs: the first has one child without an id, the second has an id but none of its children do.
    Type structCol1 = struct(3, "struct_col_1", Repetition.REQUIRED,
        primitive(4, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED),
        primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL),
        primitive(5, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL));
    Type structCol2 = struct(6, "struct_col_2", Repetition.OPTIONAL,
        primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED),
        primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL),
        primitive(null, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL));

    // Lists: id missing on the list (1), on the element (2), on the element struct (3),
    // on all children of the element struct (4); list_col_5 has ids everywhere.
    Type listCol1 = list(null, "list_col_1", Repetition.REQUIRED,
        primitive(7, "i", PrimitiveTypeName.INT32, Repetition.OPTIONAL));
    Type listCol2 = list(8, "list_col_2", Repetition.REQUIRED,
        primitive(null, "i", PrimitiveTypeName.INT32, Repetition.OPTIONAL));
    Type listCol3 = list(9, "list_col_3", Repetition.OPTIONAL,
        struct(null, "s", Repetition.REQUIRED,
            primitive(10, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED),
            primitive(11, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL)));
    Type listCol4 = list(12, "list_col_4", Repetition.REQUIRED,
        struct(13, "s", Repetition.REQUIRED,
            primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED),
            primitive(null, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL)));
    Type listCol5 = list(14, "list_col_5", Repetition.OPTIONAL,
        struct(15, "s", Repetition.REQUIRED,
            primitive(16, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED),
            primitive(17, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL)));

    // Maps: id missing on the map (1), on the key (2), on key and value (3),
    // on some children of the value struct (4); map_col_5 has ids everywhere.
    Type mapCol1 = map(null, "map_col_1", Repetition.REQUIRED,
        primitive(18, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED),
        primitive(19, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED));
    Type mapCol2 = map(20, "map_col_2", Repetition.OPTIONAL,
        primitive(null, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED),
        primitive(21, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED));
    Type mapCol3 = map(22, "map_col_3", Repetition.REQUIRED,
        primitive(null, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED),
        primitive(null, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED));
    Type mapCol4 = map(23, "map_col_4", Repetition.OPTIONAL,
        primitive(24, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED),
        struct(25, "s", Repetition.REQUIRED,
            primitive(null, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED),
            primitive(26, "n2", PrimitiveTypeName.INT64, Repetition.OPTIONAL),
            primitive(null, "n3", PrimitiveTypeName.INT64, Repetition.OPTIONAL)));
    Type mapCol5 = map(27, "map_col_5", Repetition.REQUIRED,
        primitive(28, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED),
        primitive(29, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED));

    MessageType messageType = new MessageType("test",
        intCol, doubleCol, longCol,
        structCol1, structCol2,
        listCol1, listCol2, listCol3, listCol4, listCol5,
        mapCol1, mapCol2, mapCol3, mapCol4, mapCol5);

    Types.StructType expectedStructCol1 = Types.StructType.of(
        required(4, "n1", Types.IntegerType.get()),
        optional(5, "n3", Types.LongType.get()));
    Types.StructType expectedListCol5Element = Types.StructType.of(
        required(16, "n1", Types.IntegerType.get()),
        optional(17, "n2", Types.LongType.get()));
    Types.StructType expectedMapCol4Value = Types.StructType.of(
        optional(26, "n2", Types.LongType.get()));

    Schema expectedSchema = new Schema(
        required(1, "int_col", Types.IntegerType.get()),
        optional(2, "double_col", Types.DoubleType.get()),
        required(3, "struct_col_1", expectedStructCol1),
        optional(14, "list_col_5", Types.ListType.ofRequired(15, expectedListCol5Element)),
        optional(23, "map_col_4",
            Types.MapType.ofRequired(24, 25, Types.IntegerType.get(), expectedMapCol4Value)),
        required(27, "map_col_5",
            Types.MapType.ofRequired(28, 29, Types.IntegerType.get(), Types.IntegerType.get())));

    Schema actualSchema = ParquetSchemaUtil.convertAndPrune(messageType);

    Assert.assertEquals("Schema must match", expectedSchema.asStruct(), actualSchema.asStruct());
  }

  /**
   * Builds a Parquet primitive type.
   *
   * @param id field id to attach, or {@code null} to leave the type without an id
   * @param name field name
   * @param typeName Parquet primitive type
   * @param repetition field repetition
   * @return the built type
   */
  private Type primitive(Integer id, String name, PrimitiveTypeName typeName, Repetition repetition) {
    PrimitiveBuilder<PrimitiveType> primitiveBuilder =
        org.apache.parquet.schema.Types.primitive(typeName, repetition);
    if (id == null) {
      return primitiveBuilder.named(name);
    }
    return primitiveBuilder.id(id).named(name);
  }

  /**
   * Builds a Parquet group type containing the given fields.
   *
   * @param id field id to attach, or {@code null} to leave the group without an id
   * @param name group name
   * @param repetition group repetition
   * @param types child fields, in order
   * @return the built group type
   */
  private Type struct(Integer id, String name, Repetition repetition, Type... types) {
    GroupBuilder<GroupType> groupBuilder =
        org.apache.parquet.schema.Types.buildGroup(repetition).addFields(types);
    if (id == null) {
      return groupBuilder.named(name);
    }
    return groupBuilder.id(id).named(name);
  }

  /**
   * Builds a Parquet list type with the given element type.
   *
   * @param id field id to attach to the list, or {@code null} to leave it without an id
   * @param name list field name
   * @param repetition list field repetition
   * @param elementType type of the list elements
   * @return the built list type
   */
  private Type list(Integer id, String name, Repetition repetition, Type elementType) {
    ListBuilder<GroupType> listBuilder =
        org.apache.parquet.schema.Types.list(repetition).element(elementType);
    if (id == null) {
      return listBuilder.named(name);
    }
    return listBuilder.id(id).named(name);
  }

  /**
   * Builds a Parquet map type with the given key and value types.
   *
   * @param id field id to attach to the map, or {@code null} to leave it without an id
   * @param name map field name
   * @param repetition map field repetition
   * @param keyType type of the map keys
   * @param valueType type of the map values
   * @return the built map type
   */
  private Type map(Integer id, String name, Repetition repetition, Type keyType, Type valueType) {
    MapBuilder<GroupType> mapBuilder =
        org.apache.parquet.schema.Types.map(repetition).key(keyType).value(valueType);
    if (id == null) {
      return mapBuilder.named(name);
    }
    return mapBuilder.id(id).named(name);
  }
}