blob: ad5ae4419fc9901f128e3523bd6115777cd23c89 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This module contains the logical parquet-cpp types (independent of Thrift
// structures), schema nodes, and related type tools
#ifndef PARQUET_SCHEMA_INTERNAL_H
#define PARQUET_SCHEMA_INTERNAL_H
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "parquet/platform.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace parquet {
namespace format {
class SchemaElement;
}
namespace schema {
// Returns true when `str` ends with the literal suffix "_tuple".
// Strings shorter than the suffix never match.
inline bool str_endswith_tuple(const std::string& str) {
  constexpr std::string::size_type kSuffixLength = 6;  // length of "_tuple"
  if (str.size() < kSuffixLength) return false;
  // Compare the trailing kSuffixLength characters in place.
  return str.compare(str.size() - kSuffixLength, kSuffixLength, "_tuple") == 0;
}
// Special case mentioned in the format spec: a group whose name is "array",
// or whose name ends in "_tuple", should be treated as a list of struct even
// when it has a single child element. This predicate checks the name only.
inline bool HasStructListName(const GroupNode& node) {
  const auto& group_name = node.name();
  if (group_name == "array") return true;
  return str_endswith_tuple(group_name);
}
// TODO(itaiin): This aux. function is to be deleted once repeated structs are supported
//
// Returns true when `node` is a group that is not repeated, is not annotated
// with ConvertedType::LIST, and is not a single-field group carrying one of
// the struct-list names ("array" or a "_tuple" suffix).
inline bool IsSimpleStruct(const Node* node) {
  // Non-group, repeated, and LIST-annotated nodes are rejected up front.
  const bool plain_group = node->is_group() && !node->is_repeated() &&
                           !(node->converted_type() == ConvertedType::LIST);
  if (!plain_group) return false;

  // Special case mentioned in the format spec: a single-child group named
  // "array" or ending in "_tuple" is a list of struct, not a simple struct.
  const auto* as_group = static_cast<const GroupNode*>(node);
  const bool is_struct_list =
      as_group->field_count() == 1 && HasStructListName(*as_group);
  return !is_struct_list;
}
// Coalesce a list of schema fields indices which are the roots of the
// columns referred by a list of column indices
inline bool ColumnIndicesToFieldIndices(const SchemaDescriptor& descr,
const std::vector<int>& column_indices,
std::vector<int>* out) {
const GroupNode* group = descr.group_node();
std::unordered_set<int> already_added;
out->clear();
for (auto& column_idx : column_indices) {
auto field_node = descr.GetColumnRoot(column_idx);
auto field_idx = group->FieldIndex(*field_node);
if (field_idx < 0) {
return false;
}
auto insertion = already_added.insert(field_idx);
if (insertion.second) {
out->push_back(field_idx);
}
}
return true;
}
// ----------------------------------------------------------------------
// Conversion from Parquet Thrift metadata

// Produces a shared SchemaDescriptor from a vector of Thrift SchemaElement
// structs. Declared here only; the definition lives outside this header.
// NOTE(review): presumably `schema` is the flattened schema as stored in the
// file metadata -- confirm against the implementation.
std::shared_ptr<SchemaDescriptor> FromParquet(
const std::vector<format::SchemaElement>& schema);
// Converter over a contiguous array of Thrift SchemaElement structs.
//
// The object keeps a non-owning pointer to the elements and their count, plus
// a read position and an id counter that both start at zero. Convert(),
// Next() and NextNode() are defined outside this header.
// NOTE(review): presumably Convert() builds the Node tree by consuming the
// elements in order -- confirm against the implementation.
class FlatSchemaConverter {
 public:
  // `elements` points at the first of `length` schema elements; the array is
  // not copied here.
  FlatSchemaConverter(const format::SchemaElement* elements, int length)
      : elements_(elements), length_(length) {}

  std::unique_ptr<Node> Convert();

 private:
  const format::SchemaElement* elements_;
  int length_;
  int pos_ = 0;
  int current_id_ = 0;

  // Hands out the current id and then advances the counter by one.
  int next_id() {
    const int assigned = current_id_;
    ++current_id_;
    return assigned;
  }

  const format::SchemaElement& Next();
  std::unique_ptr<Node> NextNode();
};
// ----------------------------------------------------------------------
// Conversion to Parquet Thrift metadata

// Takes a schema group node and an output vector of Thrift SchemaElement
// structs. Declared here only; the definition lives outside this header.
// NOTE(review): presumably `out` receives the flattened form of `schema` --
// confirm whether existing contents of `out` are cleared or appended to.
void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
// Converts nested parquet schema back to a flat vector of Thrift structs.
//
// The class holds a schema group node and a pointer to the output vector, both
// as raw pointers. The constructor and Flatten() are defined outside this header.
class SchemaFlattener {
public:
// `schema` is the group node to flatten; `out` is the destination vector.
// NOTE(review): presumably both pointers must outlive this object -- confirm.
SchemaFlattener(const GroupNode* schema, std::vector<format::SchemaElement>* out);
// Performs the flattening (implementation not visible in this header).
void Flatten();
private:
// Schema group node passed to the constructor (presumably; verify in the .cc).
const GroupNode* root_;
// Destination vector of Thrift schema elements (presumably the `out` argument).
std::vector<format::SchemaElement>* elements_;
};
} // namespace schema
} // namespace parquet
#endif // PARQUET_SCHEMA_INTERNAL_H