python/tests/parquet/test_parquet_to_iceberg.py - iceberg - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 from iceberg.api import Schema
 from iceberg.api.types import (BooleanType,
                                DateType,
                                DecimalType,
                                DoubleType,
                                FloatType,
                                IntegerType,
                                ListType,
                                LongType,
                                NestedField,
                                StringType,
                                StructType,
                                TimestampType)
 from iceberg.parquet.parquet_to_iceberg import convert_parquet_to_iceberg


 def compare_schema(expected_schema, converted_schema):
     """Assert that ``converted_schema`` matches ``expected_schema``.

     The two schemas must report the same length, and every field listed by
     ``expected_schema.as_struct().fields`` must be retrievable from
     ``converted_schema`` by id with an equal name, optionality and type.

     Raises:
         AssertionError: on the first mismatch encountered.
     """
     expected_size = len(expected_schema)
     converted_size = len(converted_schema)
     assert expected_size == converted_size

     for expected in expected_schema.as_struct().fields:
         # Fields are paired by id rather than by position.
         actual = converted_schema.find_field(expected.id)
         assert actual is not None
         assert expected.name == actual.name
         assert expected.is_optional == actual.is_optional
         assert expected.type == actual.type


 def test_primitive_types(primitive_type_test_parquet_file):
     """Check the schema converted from the primitive-type Parquet fixture.

     Builds the expected twelve-column schema (one required column, the rest
     optional) and compares it with the result of
     ``convert_parquet_to_iceberg`` on the fixture.
     """
     primitive_fields = [
         NestedField.required(1, "int_col", IntegerType.get()),
         NestedField.optional(2, "bigint_col", LongType.get()),
         NestedField.optional(3, "str_col", StringType.get()),
         NestedField.optional(4, "float_col", FloatType.get()),
         NestedField.optional(5, "dbl_col", DoubleType.get()),
         NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
         NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
         NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
         NestedField.optional(9, "date_col", DateType.get()),
         NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
         NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
         NestedField.optional(12, "bool_col", BooleanType.get()),
     ]
     expected_schema = Schema(primitive_fields)

     converted_schema = convert_parquet_to_iceberg(primitive_type_test_parquet_file)
     compare_schema(expected_schema, converted_schema)


 def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
     """Check the schema converted from the un-nested complex-type fixture.

     The expected schema has two optional list columns (int and string
     elements) and one optional struct column with two optional members.
     """
     list_int_col = NestedField.optional(1, "list_int_col",
                                         ListType.of_optional(3, IntegerType.get()))
     list_str_col = NestedField.optional(4, "list_str_col",
                                         ListType.of_optional(6, StringType.get()))
     struct_members = [NestedField.optional(8, "f1", IntegerType.get()),
                       NestedField.optional(9, "f2", StringType.get())]
     struct_col = NestedField.optional(7, "struct_col", StructType.of(struct_members))

     expected_schema = Schema([list_int_col, list_str_col, struct_col])
     compare_schema(expected_schema,
                    convert_parquet_to_iceberg(unnested_complex_type_test_parquet_file))


 def test_nested_complex_types():
     """Placeholder for nested complex-type conversion coverage.

     Intentionally empty until pyarrow fully supports reading and writing
     nested types, i.e. following a release containing this PR:
     https://github.com/apache/arrow/pull/8177
     """
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	from iceberg.api import Schema
	from iceberg.api.types import (BooleanType,
	DateType,
	DecimalType,
	DoubleType,
	FloatType,
	IntegerType,
	ListType,
	LongType,
	NestedField,
	StringType,
	StructType,
	TimestampType)
	from iceberg.parquet.parquet_to_iceberg import convert_parquet_to_iceberg


	def compare_schema(expected_schema, converted_schema):
	    """Assert that ``converted_schema`` matches ``expected_schema``.

	    Both schemas must report the same length, and each field listed by
	    ``expected_schema.as_struct().fields`` must be found in
	    ``converted_schema`` by id with an equal name, optionality and type.

	    Raises:
	        AssertionError: on the first mismatch encountered.
	    """
	    assert len(expected_schema) == len(converted_schema)
	    for field in expected_schema.as_struct().fields:
	        # Fields are paired by id rather than by position.
	        converted_field = converted_schema.find_field(field.id)
	        assert converted_field is not None
	        assert field.name == converted_field.name
	        assert field.is_optional == converted_field.is_optional
	        assert field.type == converted_field.type


	def test_primitive_types(primitive_type_test_parquet_file):
	    """Check the schema converted from the primitive-type Parquet fixture.

	    Builds the expected twelve-column schema (one required column, the rest
	    optional) and compares it with the result of
	    ``convert_parquet_to_iceberg`` on the fixture.
	    """
	    expected_schema = Schema([NestedField.required(1, "int_col", IntegerType.get()),
	                              NestedField.optional(2, "bigint_col", LongType.get()),
	                              NestedField.optional(3, "str_col", StringType.get()),
	                              NestedField.optional(4, "float_col", FloatType.get()),
	                              NestedField.optional(5, "dbl_col", DoubleType.get()),
	                              NestedField.optional(6, "decimal_col", DecimalType.of(9, 2)),
	                              NestedField.optional(7, "big_decimal_col", DecimalType.of(19, 5)),
	                              NestedField.optional(8, "huge_decimal_col", DecimalType.of(38, 9)),
	                              NestedField.optional(9, "date_col", DateType.get()),
	                              NestedField.optional(10, "ts_col", TimestampType.without_timezone()),
	                              NestedField.optional(11, "ts_wtz_col", TimestampType.with_timezone()),
	                              NestedField.optional(12, "bool_col", BooleanType.get())])
	    compare_schema(expected_schema, convert_parquet_to_iceberg(primitive_type_test_parquet_file))


	def test_unnested_complex_types(unnested_complex_type_test_parquet_file):
	    """Check the schema converted from the un-nested complex-type fixture.

	    The expected schema has two optional list columns (int and string
	    elements) and one optional struct column with two optional members.
	    """
	    expected_schema = Schema([NestedField.optional(1, "list_int_col", ListType.of_optional(3, IntegerType.get())),
	                              NestedField.optional(4, "list_str_col", ListType.of_optional(6, StringType.get())),
	                              NestedField.optional(7, "struct_col",
	                                                   StructType.of([NestedField.optional(8, "f1", IntegerType.get()),
	                                                                  NestedField.optional(9, "f2", StringType.get())]))
	                              ])
	    converted_schema = convert_parquet_to_iceberg(unnested_complex_type_test_parquet_file)
	    compare_schema(expected_schema, converted_schema)


	def test_nested_complex_types():
	    """Placeholder for nested complex-type conversion coverage."""
	    # This is a placeholder until pyarrow fully supports reading and writing nested types.
	    # Following a release containing this PR:
	    # https://github.com/apache/arrow/pull/8177
	    pass