// blob: 53653bd7884709b2e2fffd631b3be7224ed9a449 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "parquet/metadata.h"
#include <gtest/gtest.h>
#include "parquet/schema.h"
#include "parquet/statistics.h"
#include "parquet/types.h"
namespace parquet {
namespace metadata {
// Builds file metadata for a two-column schema (int32 + float) holding two row
// groups via the builder API, then reads everything back through the accessor
// API and checks that each value written is the value returned.
TEST(Metadata, TestBuildAccess) {
  parquet::schema::NodeVector fields;
  parquet::schema::NodePtr root;
  parquet::SchemaDescriptor schema;

  WriterProperties::Builder prop_builder;
  std::shared_ptr<WriterProperties> props =
      prop_builder.version(ParquetVersion::PARQUET_2_0)->build();

  fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED));
  fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED));
  root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);
  schema.Init(root);

  int64_t nrows = 1000;

  // Statistics are handed over as the raw bytes of the min/max values; the
  // byte count is taken from the value itself rather than hard-coded.
  int32_t int_min = 100, int_max = 200;
  EncodedStatistics stats_int;
  stats_int.set_null_count(0)
      .set_distinct_count(nrows)
      .set_min(std::string(reinterpret_cast<const char*>(&int_min), sizeof(int_min)))
      .set_max(std::string(reinterpret_cast<const char*>(&int_max), sizeof(int_max)));

  EncodedStatistics stats_float;
  float float_min = 100.100f, float_max = 200.200f;
  stats_float.set_null_count(0)
      .set_distinct_count(nrows)
      .set_min(
          std::string(reinterpret_cast<const char*>(&float_min), sizeof(float_min)))
      .set_max(
          std::string(reinterpret_cast<const char*>(&float_max), sizeof(float_max)));

  auto f_builder = FileMetaDataBuilder::Make(&schema, props);
  auto rg1_builder = f_builder->AppendRowGroup();
  auto rg2_builder = f_builder->AppendRowGroup();

  // Write the metadata
  // rowgroup1 metadata
  auto col1_builder = rg1_builder->NextColumnChunk();
  auto col2_builder = rg1_builder->NextColumnChunk();
  // column metadata
  col1_builder->SetStatistics(true, stats_int);
  col2_builder->SetStatistics(true, stats_float);
  // As confirmed by the read-back assertions below, the Finish() arguments
  // include: number of values, dictionary page offset, (third argument not
  // observable from this test), data page offset, compressed size and
  // uncompressed size.
  col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false);
  col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false);
  rg1_builder->set_num_rows(nrows / 2);
  rg1_builder->Finish(1024);

  // rowgroup2 metadata
  col1_builder = rg2_builder->NextColumnChunk();
  col2_builder = rg2_builder->NextColumnChunk();
  // column metadata
  col1_builder->SetStatistics(true, stats_int);
  col2_builder->SetStatistics(true, stats_float);
  col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false);
  col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false);
  rg2_builder->set_num_rows(nrows / 2);
  rg2_builder->Finish(1024);

  // Read the metadata
  auto f_accessor = f_builder->Finish();

  // file metadata
  ASSERT_EQ(nrows, f_accessor->num_rows());
  ASSERT_LE(0, static_cast<int>(f_accessor->size()));
  ASSERT_EQ(2, f_accessor->num_row_groups());
  ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version());
  ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by());
  // Three elements are expected for a schema made of one group and two fields.
  ASSERT_EQ(3, f_accessor->num_schema_elements());

  // row group1 metadata
  auto rg1_accessor = f_accessor->RowGroup(0);
  ASSERT_EQ(2, rg1_accessor->num_columns());
  ASSERT_EQ(nrows / 2, rg1_accessor->num_rows());
  ASSERT_EQ(1024, rg1_accessor->total_byte_size());

  auto rg1_column1 = rg1_accessor->ColumnChunk(0);
  auto rg1_column2 = rg1_accessor->ColumnChunk(1);
  ASSERT_EQ(true, rg1_column1->is_stats_set());
  ASSERT_EQ(true, rg1_column2->is_stats_set());
  ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin());
  ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax());
  ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
  ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
  ASSERT_EQ(0, rg1_column1->statistics()->null_count());
  ASSERT_EQ(0, rg1_column2->statistics()->null_count());
  ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count());
  ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count());
  ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression());
  ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression());
  ASSERT_EQ(nrows / 2, rg1_column1->num_values());
  ASSERT_EQ(nrows / 2, rg1_column2->num_values());
  ASSERT_EQ(3, rg1_column1->encodings().size());
  ASSERT_EQ(3, rg1_column2->encodings().size());
  ASSERT_EQ(512, rg1_column1->total_compressed_size());
  ASSERT_EQ(512, rg1_column2->total_compressed_size());
  ASSERT_EQ(600, rg1_column1->total_uncompressed_size());
  ASSERT_EQ(600, rg1_column2->total_uncompressed_size());
  ASSERT_EQ(4, rg1_column1->dictionary_page_offset());
  ASSERT_EQ(24, rg1_column2->dictionary_page_offset());
  ASSERT_EQ(10, rg1_column1->data_page_offset());
  ASSERT_EQ(30, rg1_column2->data_page_offset());

  // row group2 metadata
  auto rg2_accessor = f_accessor->RowGroup(1);
  ASSERT_EQ(2, rg2_accessor->num_columns());
  ASSERT_EQ(nrows / 2, rg2_accessor->num_rows());
  ASSERT_EQ(1024, rg2_accessor->total_byte_size());

  auto rg2_column1 = rg2_accessor->ColumnChunk(0);
  auto rg2_column2 = rg2_accessor->ColumnChunk(1);
  ASSERT_EQ(true, rg2_column1->is_stats_set());
  ASSERT_EQ(true, rg2_column2->is_stats_set());
  ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin());
  ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax());
  // Check the int column of row group 2 itself (not row group 1's again).
  ASSERT_EQ(stats_int.min(), rg2_column1->statistics()->EncodeMin());
  ASSERT_EQ(stats_int.max(), rg2_column1->statistics()->EncodeMax());
  ASSERT_EQ(0, rg2_column1->statistics()->null_count());
  ASSERT_EQ(0, rg2_column2->statistics()->null_count());
  ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count());
  ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count());
  ASSERT_EQ(nrows / 2, rg2_column1->num_values());
  ASSERT_EQ(nrows / 2, rg2_column2->num_values());
  ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
  ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression());
  ASSERT_EQ(3, rg2_column1->encodings().size());
  ASSERT_EQ(3, rg2_column2->encodings().size());
  ASSERT_EQ(512, rg2_column1->total_compressed_size());
  ASSERT_EQ(512, rg2_column2->total_compressed_size());
  ASSERT_EQ(600, rg2_column1->total_uncompressed_size());
  ASSERT_EQ(600, rg2_column2->total_uncompressed_size());
  ASSERT_EQ(6, rg2_column1->dictionary_page_offset());
  ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
  ASSERT_EQ(10, rg2_column1->data_page_offset());
  ASSERT_EQ(26, rg2_column2->data_page_offset());
}
// PARQUET-839: metadata built with writer properties requesting format
// version 1.0 must report PARQUET_1_0 when read back.
TEST(Metadata, TestV1Version) {
  // Two-column schema: a required int32 and a required float under one group.
  parquet::schema::NodeVector columns;
  columns.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED));
  columns.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED));
  parquet::schema::NodePtr schema_root =
      parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, columns);

  parquet::SchemaDescriptor descriptor;
  descriptor.Init(schema_root);

  // Writer properties pinned to the 1.0 format version.
  WriterProperties::Builder properties_builder;
  std::shared_ptr<WriterProperties> writer_properties =
      properties_builder.version(ParquetVersion::PARQUET_1_0)->build();

  // Finish immediately (no row groups appended) and inspect the result.
  auto metadata_builder = FileMetaDataBuilder::Make(&descriptor, writer_properties);
  auto file_metadata = metadata_builder->Finish();

  ASSERT_EQ(ParquetVersion::PARQUET_1_0, file_metadata->version());
}
// Exercises ApplicationVersion on several created-by strings: parsed
// application name, numeric version parts, extra version fields, version
// ordering, and the HasCorrectStatistics() answers for a few type/sort-order
// combinations.
TEST(ApplicationVersion, Basics) {
  ApplicationVersion mr_1_7_9("parquet-mr version 1.7.9");
  ApplicationVersion mr_1_8_0("parquet-mr version 1.8.0");
  ApplicationVersion cpp_1_0_0("parquet-cpp version 1.0.0");
  ApplicationVersion empty_created_by("");
  ApplicationVersion mr_cdh_build("parquet-mr version 1.5.0ab-cdh5.5.0+cd (build abcd)");
  ApplicationVersion mr_name_only("parquet-mr");

  // Plain "<application> version <major>.<minor>.<patch>" strings.
  ASSERT_EQ("parquet-mr", mr_1_7_9.application_);
  ASSERT_EQ(1, mr_1_7_9.version.major);
  ASSERT_EQ(7, mr_1_7_9.version.minor);
  ASSERT_EQ(9, mr_1_7_9.version.patch);

  ASSERT_EQ("parquet-cpp", cpp_1_0_0.application_);
  ASSERT_EQ(1, cpp_1_0_0.version.major);
  ASSERT_EQ(0, cpp_1_0_0.version.minor);
  ASSERT_EQ(0, cpp_1_0_0.version.patch);

  // Version string carrying a suffix, pre-release tag, build info and a
  // trailing "(build ...)" section.
  ASSERT_EQ("parquet-mr", mr_cdh_build.application_);
  ASSERT_EQ("abcd", mr_cdh_build.build_);
  ASSERT_EQ(1, mr_cdh_build.version.major);
  ASSERT_EQ(5, mr_cdh_build.version.minor);
  ASSERT_EQ(0, mr_cdh_build.version.patch);
  ASSERT_EQ("ab", mr_cdh_build.version.unknown);
  ASSERT_EQ("cdh5.5.0", mr_cdh_build.version.pre_release);
  ASSERT_EQ("cd", mr_cdh_build.version.build_info);

  // Application name with no version: every numeric part is expected to be 0.
  ASSERT_EQ("parquet-mr", mr_name_only.application_);
  ASSERT_EQ(0, mr_name_only.version.major);
  ASSERT_EQ(0, mr_name_only.version.minor);
  ASSERT_EQ(0, mr_name_only.version.patch);

  // 1.7.9 sorts before 1.8.0.
  ASSERT_EQ(true, mr_1_7_9.VersionLt(mr_1_8_0));

  // Statistics-correctness expectations per writer version / type / order.
  ASSERT_FALSE(mr_1_8_0.HasCorrectStatistics(Type::INT96, SortOrder::UNKNOWN));
  ASSERT_TRUE(mr_1_7_9.HasCorrectStatistics(Type::INT32, SortOrder::SIGNED));
  ASSERT_FALSE(mr_1_7_9.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
  ASSERT_TRUE(mr_1_8_0.HasCorrectStatistics(Type::BYTE_ARRAY, SortOrder::SIGNED));
  ASSERT_TRUE(empty_created_by.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY,
                                                    SortOrder::SIGNED));
}
} // namespace metadata
} // namespace parquet