| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "arrow/type.h" |
| |
| #include <algorithm> |
| #include <climits> |
| #include <cstddef> |
| #include <limits> |
| #include <ostream> |
| #include <sstream> // IWYU pragma: keep |
| #include <string> |
| #include <unordered_map> |
| #include <unordered_set> |
| #include <utility> |
| #include <vector> |
| |
| #include "arrow/array.h" |
| #include "arrow/compare.h" |
| #include "arrow/record_batch.h" |
| #include "arrow/result.h" |
| #include "arrow/status.h" |
| #include "arrow/util/checked_cast.h" |
| #include "arrow/util/hash_util.h" |
| #include "arrow/util/hashing.h" |
| #include "arrow/util/key_value_metadata.h" |
| #include "arrow/util/logging.h" |
| #include "arrow/util/make_unique.h" |
| #include "arrow/util/range.h" |
| #include "arrow/util/vector.h" |
| #include "arrow/visitor_inline.h" |
| |
| namespace arrow { |
| |
| constexpr Type::type NullType::type_id; |
| constexpr Type::type ListType::type_id; |
| constexpr Type::type LargeListType::type_id; |
| |
| constexpr Type::type MapType::type_id; |
| |
| constexpr Type::type FixedSizeListType::type_id; |
| |
| constexpr Type::type BinaryType::type_id; |
| |
| constexpr Type::type LargeBinaryType::type_id; |
| |
| constexpr Type::type StringType::type_id; |
| |
| constexpr Type::type LargeStringType::type_id; |
| |
| constexpr Type::type FixedSizeBinaryType::type_id; |
| |
| constexpr Type::type StructType::type_id; |
| |
| constexpr Type::type Decimal128Type::type_id; |
| |
| constexpr Type::type Decimal256Type::type_id; |
| |
| constexpr Type::type SparseUnionType::type_id; |
| |
| constexpr Type::type DenseUnionType::type_id; |
| |
| constexpr Type::type Date32Type::type_id; |
| |
| constexpr Type::type Date64Type::type_id; |
| |
| constexpr Type::type Time32Type::type_id; |
| |
| constexpr Type::type Time64Type::type_id; |
| |
| constexpr Type::type TimestampType::type_id; |
| |
| constexpr Type::type MonthIntervalType::type_id; |
| |
| constexpr Type::type DayTimeIntervalType::type_id; |
| |
| constexpr Type::type DurationType::type_id; |
| |
| constexpr Type::type DictionaryType::type_id; |
| |
| namespace internal { |
| |
| struct TypeIdToTypeNameVisitor { |
| std::string out; |
| |
| template <typename ArrowType> |
| Status Visit(const ArrowType*) { |
| out = ArrowType::type_name(); |
| return Status::OK(); |
| } |
| }; |
| |
| std::string ToTypeName(Type::type id) { |
| TypeIdToTypeNameVisitor visitor; |
| |
| ARROW_CHECK_OK(VisitTypeIdInline(id, &visitor)); |
| return std::move(visitor.out); |
| } |
| |
| std::string ToString(Type::type id) { |
| switch (id) { |
| #define TO_STRING_CASE(_id) \ |
| case Type::_id: \ |
| return ARROW_STRINGIFY(_id); |
| |
| TO_STRING_CASE(NA) |
| TO_STRING_CASE(BOOL) |
| TO_STRING_CASE(INT8) |
| TO_STRING_CASE(INT16) |
| TO_STRING_CASE(INT32) |
| TO_STRING_CASE(INT64) |
| TO_STRING_CASE(UINT8) |
| TO_STRING_CASE(UINT16) |
| TO_STRING_CASE(UINT32) |
| TO_STRING_CASE(UINT64) |
| TO_STRING_CASE(HALF_FLOAT) |
| TO_STRING_CASE(FLOAT) |
| TO_STRING_CASE(DOUBLE) |
| TO_STRING_CASE(DECIMAL128) |
| TO_STRING_CASE(DECIMAL256) |
| TO_STRING_CASE(DATE32) |
| TO_STRING_CASE(DATE64) |
| TO_STRING_CASE(TIME32) |
| TO_STRING_CASE(TIME64) |
| TO_STRING_CASE(TIMESTAMP) |
| TO_STRING_CASE(INTERVAL_DAY_TIME) |
| TO_STRING_CASE(INTERVAL_MONTHS) |
| TO_STRING_CASE(DURATION) |
| TO_STRING_CASE(STRING) |
| TO_STRING_CASE(BINARY) |
| TO_STRING_CASE(LARGE_STRING) |
| TO_STRING_CASE(LARGE_BINARY) |
| TO_STRING_CASE(FIXED_SIZE_BINARY) |
| TO_STRING_CASE(STRUCT) |
| TO_STRING_CASE(LIST) |
| TO_STRING_CASE(LARGE_LIST) |
| TO_STRING_CASE(FIXED_SIZE_LIST) |
| TO_STRING_CASE(MAP) |
| TO_STRING_CASE(DENSE_UNION) |
| TO_STRING_CASE(SPARSE_UNION) |
| TO_STRING_CASE(DICTIONARY) |
| TO_STRING_CASE(EXTENSION) |
| |
| #undef TO_STRING_CASE |
| |
| default: |
| ARROW_LOG(FATAL) << "Unhandled type id: " << id; |
| return ""; |
| } |
| } |
| |
| std::string ToString(TimeUnit::type unit) { |
| switch (unit) { |
| case TimeUnit::SECOND: |
| return "s"; |
| case TimeUnit::MILLI: |
| return "ms"; |
| case TimeUnit::MICRO: |
| return "us"; |
| case TimeUnit::NANO: |
| return "ns"; |
| default: |
| DCHECK(false); |
| return ""; |
| } |
| } |
| |
| int GetByteWidth(const DataType& type) { |
| const auto& fw_type = checked_cast<const FixedWidthType&>(type); |
| return fw_type.bit_width() / CHAR_BIT; |
| } |
| |
| } // namespace internal |
| |
| namespace { |
| |
| struct PhysicalTypeVisitor { |
| const std::shared_ptr<DataType>& real_type; |
| std::shared_ptr<DataType> result; |
| |
| Status Visit(const DataType&) { |
| result = real_type; |
| return Status::OK(); |
| } |
| |
| template <typename Type, typename PhysicalType = typename Type::PhysicalType> |
| Status Visit(const Type&) { |
| result = TypeTraits<PhysicalType>::type_singleton(); |
| return Status::OK(); |
| } |
| }; |
| |
| } // namespace |
| |
| std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& real_type) { |
| PhysicalTypeVisitor visitor{real_type, {}}; |
| ARROW_CHECK_OK(VisitTypeInline(*real_type, &visitor)); |
| return std::move(visitor.result); |
| } |
| |
| namespace { |
| |
| using internal::checked_cast; |
| |
| // Merges `existing` and `other` if one of them is of NullType, otherwise |
| // returns nullptr. |
| // - if `other` if of NullType or is nullable, the unified field will be nullable. |
| // - if `existing` is of NullType but other is not, the unified field will |
| // have `other`'s type and will be nullable |
| std::shared_ptr<Field> MaybePromoteNullTypes(const Field& existing, const Field& other) { |
| if (existing.type()->id() != Type::NA && other.type()->id() != Type::NA) { |
| return nullptr; |
| } |
| if (existing.type()->id() == Type::NA) { |
| return other.WithNullable(true)->WithMetadata(existing.metadata()); |
| } |
| // `other` must be null. |
| return existing.WithNullable(true); |
| } |
| } // namespace |
| |
| Field::~Field() {} |
| |
| bool Field::HasMetadata() const { |
| return (metadata_ != nullptr) && (metadata_->size() > 0); |
| } |
| |
| std::shared_ptr<Field> Field::WithMetadata( |
| const std::shared_ptr<const KeyValueMetadata>& metadata) const { |
| return std::make_shared<Field>(name_, type_, nullable_, metadata); |
| } |
| |
| std::shared_ptr<Field> Field::WithMergedMetadata( |
| const std::shared_ptr<const KeyValueMetadata>& metadata) const { |
| std::shared_ptr<const KeyValueMetadata> merged_metadata; |
| if (metadata_) { |
| merged_metadata = metadata_->Merge(*metadata); |
| } else { |
| merged_metadata = metadata; |
| } |
| return std::make_shared<Field>(name_, type_, nullable_, merged_metadata); |
| } |
| |
| std::shared_ptr<Field> Field::RemoveMetadata() const { |
| return std::make_shared<Field>(name_, type_, nullable_); |
| } |
| |
| std::shared_ptr<Field> Field::WithType(const std::shared_ptr<DataType>& type) const { |
| return std::make_shared<Field>(name_, type, nullable_, metadata_); |
| } |
| |
| std::shared_ptr<Field> Field::WithName(const std::string& name) const { |
| return std::make_shared<Field>(name, type_, nullable_, metadata_); |
| } |
| |
| std::shared_ptr<Field> Field::WithNullable(const bool nullable) const { |
| return std::make_shared<Field>(name_, type_, nullable, metadata_); |
| } |
| |
| Result<std::shared_ptr<Field>> Field::MergeWith(const Field& other, |
| MergeOptions options) const { |
| if (name() != other.name()) { |
| return Status::Invalid("Field ", name(), " doesn't have the same name as ", |
| other.name()); |
| } |
| |
| if (Equals(other, /*check_metadata=*/false)) { |
| return Copy(); |
| } |
| |
| if (options.promote_nullability) { |
| if (type()->Equals(other.type())) { |
| return Copy()->WithNullable(nullable() || other.nullable()); |
| } |
| std::shared_ptr<Field> promoted = MaybePromoteNullTypes(*this, other); |
| if (promoted) return promoted; |
| } |
| |
| return Status::Invalid("Unable to merge: Field ", name(), |
| " has incompatible types: ", type()->ToString(), " vs ", |
| other.type()->ToString()); |
| } |
| |
| Result<std::shared_ptr<Field>> Field::MergeWith(const std::shared_ptr<Field>& other, |
| MergeOptions options) const { |
| DCHECK_NE(other, nullptr); |
| return MergeWith(*other, options); |
| } |
| |
| std::vector<std::shared_ptr<Field>> Field::Flatten() const { |
| std::vector<std::shared_ptr<Field>> flattened; |
| if (type_->id() == Type::STRUCT) { |
| for (const auto& child : type_->fields()) { |
| auto flattened_child = child->Copy(); |
| flattened.push_back(flattened_child); |
| flattened_child->name_.insert(0, name() + "."); |
| flattened_child->nullable_ |= nullable_; |
| } |
| } else { |
| flattened.push_back(this->Copy()); |
| } |
| return flattened; |
| } |
| |
| std::shared_ptr<Field> Field::Copy() const { |
| return ::arrow::field(name_, type_, nullable_, metadata_); |
| } |
| |
| bool Field::Equals(const Field& other, bool check_metadata) const { |
| if (this == &other) { |
| return true; |
| } |
| if (this->name_ == other.name_ && this->nullable_ == other.nullable_ && |
| this->type_->Equals(*other.type_.get(), check_metadata)) { |
| if (!check_metadata) { |
| return true; |
| } else if (this->HasMetadata() && other.HasMetadata()) { |
| return metadata_->Equals(*other.metadata_); |
| } else if (!this->HasMetadata() && !other.HasMetadata()) { |
| return true; |
| } else { |
| return false; |
| } |
| } |
| return false; |
| } |
| |
| bool Field::Equals(const std::shared_ptr<Field>& other, bool check_metadata) const { |
| return Equals(*other.get(), check_metadata); |
| } |
| |
| bool Field::IsCompatibleWith(const Field& other) const { return MergeWith(other).ok(); } |
| |
| bool Field::IsCompatibleWith(const std::shared_ptr<Field>& other) const { |
| DCHECK_NE(other, nullptr); |
| return IsCompatibleWith(*other); |
| } |
| |
| std::string Field::ToString(bool show_metadata) const { |
| std::stringstream ss; |
| ss << name_ << ": " << type_->ToString(); |
| if (!nullable_) { |
| ss << " not null"; |
| } |
| if (show_metadata && metadata_) { |
| ss << metadata_->ToString(); |
| } |
| return ss.str(); |
| } |
| |
| DataType::~DataType() {} |
| |
| bool DataType::Equals(const DataType& other, bool check_metadata) const { |
| return TypeEquals(*this, other, check_metadata); |
| } |
| |
| bool DataType::Equals(const std::shared_ptr<DataType>& other) const { |
| if (!other) { |
| return false; |
| } |
| return Equals(*other.get()); |
| } |
| |
| size_t DataType::Hash() const { |
| static constexpr size_t kHashSeed = 0; |
| size_t result = kHashSeed; |
| internal::hash_combine(result, this->ComputeFingerprint()); |
| return result; |
| } |
| |
| std::ostream& operator<<(std::ostream& os, const DataType& type) { |
| os << type.ToString(); |
| return os; |
| } |
| |
| FloatingPointType::Precision HalfFloatType::precision() const { |
| return FloatingPointType::HALF; |
| } |
| |
| FloatingPointType::Precision FloatType::precision() const { |
| return FloatingPointType::SINGLE; |
| } |
| |
| FloatingPointType::Precision DoubleType::precision() const { |
| return FloatingPointType::DOUBLE; |
| } |
| |
| std::string ListType::ToString() const { |
| std::stringstream s; |
| s << "list<" << value_field()->ToString() << ">"; |
| return s.str(); |
| } |
| |
| std::string LargeListType::ToString() const { |
| std::stringstream s; |
| s << "large_list<" << value_field()->ToString() << ">"; |
| return s.str(); |
| } |
| |
| MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type, |
| bool keys_sorted) |
| : MapType(::arrow::field("key", std::move(key_type), false), |
| ::arrow::field("value", std::move(item_type)), keys_sorted) {} |
| |
| MapType::MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field, |
| bool keys_sorted) |
| : MapType(::arrow::field("key", std::move(key_type), false), std::move(item_field), |
| keys_sorted) {} |
| |
| MapType::MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field, |
| bool keys_sorted) |
| : MapType( |
| ::arrow::field("entries", |
| struct_({std::move(key_field), std::move(item_field)}), false), |
| keys_sorted) {} |
| |
| MapType::MapType(std::shared_ptr<Field> value_field, bool keys_sorted) |
| : ListType(std::move(value_field)), keys_sorted_(keys_sorted) { |
| id_ = type_id; |
| } |
| |
| Result<std::shared_ptr<DataType>> MapType::Make(std::shared_ptr<Field> value_field, |
| bool keys_sorted) { |
| const auto& value_type = *value_field->type(); |
| if (value_field->nullable() || value_type.id() != Type::STRUCT) { |
| return Status::TypeError("Map entry field should be non-nullable struct"); |
| } |
| const auto& struct_type = checked_cast<const StructType&>(value_type); |
| if (struct_type.num_fields() != 2) { |
| return Status::TypeError("Map entry field should have two children (got ", |
| struct_type.num_fields(), ")"); |
| } |
| if (struct_type.field(0)->nullable()) { |
| return Status::TypeError("Map key field should be non-nullable"); |
| } |
| return std::make_shared<MapType>(std::move(value_field), keys_sorted); |
| } |
| |
| std::string MapType::ToString() const { |
| std::stringstream s; |
| |
| const auto print_field_name = [](std::ostream& os, const Field& field, |
| const char* std_name) { |
| if (field.name() != std_name) { |
| os << " ('" << field.name() << "')"; |
| } |
| }; |
| const auto print_field = [&](std::ostream& os, const Field& field, |
| const char* std_name) { |
| os << field.type()->ToString(); |
| print_field_name(os, field, std_name); |
| }; |
| |
| s << "map<"; |
| print_field(s, *key_field(), "key"); |
| s << ", "; |
| print_field(s, *item_field(), "value"); |
| if (keys_sorted_) { |
| s << ", keys_sorted"; |
| } |
| print_field_name(s, *value_field(), "entries"); |
| s << ">"; |
| return s.str(); |
| } |
| |
| std::string FixedSizeListType::ToString() const { |
| std::stringstream s; |
| s << "fixed_size_list<" << value_field()->ToString() << ">[" << list_size_ << "]"; |
| return s.str(); |
| } |
| |
| std::string BinaryType::ToString() const { return "binary"; } |
| |
| std::string LargeBinaryType::ToString() const { return "large_binary"; } |
| |
| std::string StringType::ToString() const { return "string"; } |
| |
| std::string LargeStringType::ToString() const { return "large_string"; } |
| |
| int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } |
| |
| Result<std::shared_ptr<DataType>> FixedSizeBinaryType::Make(int32_t byte_width) { |
| if (byte_width < 0) { |
| return Status::Invalid("Negative FixedSizeBinaryType byte width"); |
| } |
| if (byte_width > std::numeric_limits<int>::max() / CHAR_BIT) { |
| // bit_width() would overflow |
| return Status::Invalid("byte width of FixedSizeBinaryType too large"); |
| } |
| return std::make_shared<FixedSizeBinaryType>(byte_width); |
| } |
| |
| std::string FixedSizeBinaryType::ToString() const { |
| std::stringstream ss; |
| ss << "fixed_size_binary[" << byte_width_ << "]"; |
| return ss.str(); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Date types |
| |
| DateType::DateType(Type::type type_id) : TemporalType(type_id) {} |
| |
| Date32Type::Date32Type() : DateType(Type::DATE32) {} |
| |
| Date64Type::Date64Type() : DateType(Type::DATE64) {} |
| |
| std::string Date64Type::ToString() const { return std::string("date64[ms]"); } |
| |
| std::string Date32Type::ToString() const { return std::string("date32[day]"); } |
| |
| // ---------------------------------------------------------------------- |
| // Time types |
| |
| TimeType::TimeType(Type::type type_id, TimeUnit::type unit) |
| : TemporalType(type_id), unit_(unit) {} |
| |
| Time32Type::Time32Type(TimeUnit::type unit) : TimeType(Type::TIME32, unit) { |
| ARROW_CHECK(unit == TimeUnit::SECOND || unit == TimeUnit::MILLI) |
| << "Must be seconds or milliseconds"; |
| } |
| |
| std::string Time32Type::ToString() const { |
| std::stringstream ss; |
| ss << "time32[" << this->unit_ << "]"; |
| return ss.str(); |
| } |
| |
| Time64Type::Time64Type(TimeUnit::type unit) : TimeType(Type::TIME64, unit) { |
| ARROW_CHECK(unit == TimeUnit::MICRO || unit == TimeUnit::NANO) |
| << "Must be microseconds or nanoseconds"; |
| } |
| |
| std::string Time64Type::ToString() const { |
| std::stringstream ss; |
| ss << "time64[" << this->unit_ << "]"; |
| return ss.str(); |
| } |
| |
| std::ostream& operator<<(std::ostream& os, TimeUnit::type unit) { |
| switch (unit) { |
| case TimeUnit::SECOND: |
| os << "s"; |
| break; |
| case TimeUnit::MILLI: |
| os << "ms"; |
| break; |
| case TimeUnit::MICRO: |
| os << "us"; |
| break; |
| case TimeUnit::NANO: |
| os << "ns"; |
| break; |
| } |
| return os; |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Timestamp types |
| |
| std::string TimestampType::ToString() const { |
| std::stringstream ss; |
| ss << "timestamp[" << this->unit_; |
| if (this->timezone_.size() > 0) { |
| ss << ", tz=" << this->timezone_; |
| } |
| ss << "]"; |
| return ss.str(); |
| } |
| |
| // Duration types |
| std::string DurationType::ToString() const { |
| std::stringstream ss; |
| ss << "duration[" << this->unit_ << "]"; |
| return ss.str(); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Union type |
| |
| constexpr int8_t UnionType::kMaxTypeCode; |
| constexpr int UnionType::kInvalidChildId; |
| |
| UnionMode::type UnionType::mode() const { |
| return id_ == Type::SPARSE_UNION ? UnionMode::SPARSE : UnionMode::DENSE; |
| } |
| |
| UnionType::UnionType(std::vector<std::shared_ptr<Field>> fields, |
| std::vector<int8_t> type_codes, Type::type id) |
| : NestedType(id), |
| type_codes_(std::move(type_codes)), |
| child_ids_(kMaxTypeCode + 1, kInvalidChildId) { |
| children_ = std::move(fields); |
| DCHECK_OK(ValidateParameters(children_, type_codes_, mode())); |
| for (int child_id = 0; child_id < static_cast<int>(type_codes_.size()); ++child_id) { |
| const auto type_code = type_codes_[child_id]; |
| child_ids_[type_code] = child_id; |
| } |
| } |
| |
| Status UnionType::ValidateParameters(const std::vector<std::shared_ptr<Field>>& fields, |
| const std::vector<int8_t>& type_codes, |
| UnionMode::type mode) { |
| if (fields.size() != type_codes.size()) { |
| return Status::Invalid("Union should get the same number of fields as type codes"); |
| } |
| for (const auto type_code : type_codes) { |
| if (type_code < 0 || type_code > kMaxTypeCode) { |
| return Status::Invalid("Union type code out of bounds"); |
| } |
| } |
| return Status::OK(); |
| } |
| |
| DataTypeLayout UnionType::layout() const { |
| if (mode() == UnionMode::SPARSE) { |
| return DataTypeLayout( |
| {DataTypeLayout::AlwaysNull(), DataTypeLayout::FixedWidth(sizeof(uint8_t))}); |
| } else { |
| return DataTypeLayout({DataTypeLayout::AlwaysNull(), |
| DataTypeLayout::FixedWidth(sizeof(uint8_t)), |
| DataTypeLayout::FixedWidth(sizeof(int32_t))}); |
| } |
| } |
| |
| uint8_t UnionType::max_type_code() const { |
| return type_codes_.size() == 0 |
| ? 0 |
| : *std::max_element(type_codes_.begin(), type_codes_.end()); |
| } |
| |
| std::string UnionType::ToString() const { |
| std::stringstream s; |
| |
| s << name() << "<"; |
| |
| for (size_t i = 0; i < children_.size(); ++i) { |
| if (i) { |
| s << ", "; |
| } |
| s << children_[i]->ToString() << "=" << static_cast<int>(type_codes_[i]); |
| } |
| s << ">"; |
| return s.str(); |
| } |
| |
| SparseUnionType::SparseUnionType(std::vector<std::shared_ptr<Field>> fields, |
| std::vector<int8_t> type_codes) |
| : UnionType(fields, type_codes, Type::SPARSE_UNION) {} |
| |
| Result<std::shared_ptr<DataType>> SparseUnionType::Make( |
| std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes) { |
| RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::SPARSE)); |
| return std::make_shared<SparseUnionType>(fields, type_codes); |
| } |
| |
| DenseUnionType::DenseUnionType(std::vector<std::shared_ptr<Field>> fields, |
| std::vector<int8_t> type_codes) |
| : UnionType(fields, type_codes, Type::DENSE_UNION) {} |
| |
| Result<std::shared_ptr<DataType>> DenseUnionType::Make( |
| std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes) { |
| RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::DENSE)); |
| return std::make_shared<DenseUnionType>(fields, type_codes); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Struct type |
| |
| namespace { |
| |
| std::unordered_multimap<std::string, int> CreateNameToIndexMap( |
| const std::vector<std::shared_ptr<Field>>& fields) { |
| std::unordered_multimap<std::string, int> name_to_index; |
| for (size_t i = 0; i < fields.size(); ++i) { |
| name_to_index.emplace(fields[i]->name(), static_cast<int>(i)); |
| } |
| return name_to_index; |
| } |
| |
// Looks up `name` in a name-to-index multimap.
//
// Returns the index mapped to `name` when exactly one entry exists,
// `NotFoundValue` when there is none, and `DuplicateFoundValue` when there
// are several.
template <int NotFoundValue = -1, int DuplicateFoundValue = -1>
int LookupNameIndex(const std::unordered_multimap<std::string, int>& name_to_index,
                    const std::string& name) {
  const auto matches = name_to_index.equal_range(name);
  const auto first_match = matches.first;
  if (first_match == matches.second) {
    return NotFoundValue;
  }

  auto second_match = first_match;
  ++second_match;
  if (second_match != matches.second) {
    // More than one field carries this name.
    return DuplicateFoundValue;
  }
  return first_match->second;
}
| |
| } // namespace |
| |
| class StructType::Impl { |
| public: |
| explicit Impl(const std::vector<std::shared_ptr<Field>>& fields) |
| : name_to_index_(CreateNameToIndexMap(fields)) {} |
| |
| const std::unordered_multimap<std::string, int> name_to_index_; |
| }; |
| |
| StructType::StructType(const std::vector<std::shared_ptr<Field>>& fields) |
| : NestedType(Type::STRUCT), impl_(new Impl(fields)) { |
| children_ = fields; |
| } |
| |
| StructType::~StructType() {} |
| |
| std::string StructType::ToString() const { |
| std::stringstream s; |
| s << "struct<"; |
| for (int i = 0; i < this->num_fields(); ++i) { |
| if (i > 0) { |
| s << ", "; |
| } |
| std::shared_ptr<Field> field = this->field(i); |
| s << field->ToString(); |
| } |
| s << ">"; |
| return s.str(); |
| } |
| |
| std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const { |
| int i = GetFieldIndex(name); |
| return i == -1 ? nullptr : children_[i]; |
| } |
| |
| int StructType::GetFieldIndex(const std::string& name) const { |
| return LookupNameIndex(impl_->name_to_index_, name); |
| } |
| |
| std::vector<int> StructType::GetAllFieldIndices(const std::string& name) const { |
| std::vector<int> result; |
| auto p = impl_->name_to_index_.equal_range(name); |
| for (auto it = p.first; it != p.second; ++it) { |
| result.push_back(it->second); |
| } |
| if (result.size() > 1) { |
| std::sort(result.begin(), result.end()); |
| } |
| return result; |
| } |
| |
| std::vector<std::shared_ptr<Field>> StructType::GetAllFieldsByName( |
| const std::string& name) const { |
| std::vector<std::shared_ptr<Field>> result; |
| auto p = impl_->name_to_index_.equal_range(name); |
| for (auto it = p.first; it != p.second; ++it) { |
| result.push_back(children_[it->second]); |
| } |
| return result; |
| } |
| |
| // Taken from the Apache Impala codebase. The comments next |
| // to the return values are the maximum value that can be represented in 2's |
| // complement with the returned number of bytes. |
| int32_t DecimalType::DecimalSize(int32_t precision) { |
| DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got " |
| << precision; |
| |
| // Generated in python with: |
| // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8)) |
| // >>> [-1] + [decimal_size(i) for i in range(1, 77)] |
| constexpr int32_t kBytes[] = { |
| -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9, |
| 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17, |
| 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25, |
| 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32}; |
| |
| if (precision <= 76) { |
| return kBytes[precision]; |
| } |
| return static_cast<int32_t>(std::ceil((precision / 8.0) * std::log2(10) + 1)); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Decimal128 type |
| |
| Decimal128Type::Decimal128Type(int32_t precision, int32_t scale) |
| : DecimalType(type_id, 16, precision, scale) { |
| ARROW_CHECK_GE(precision, kMinPrecision); |
| ARROW_CHECK_LE(precision, kMaxPrecision); |
| } |
| |
| Result<std::shared_ptr<DataType>> Decimal128Type::Make(int32_t precision, int32_t scale) { |
| if (precision < kMinPrecision || precision > kMaxPrecision) { |
| return Status::Invalid("Decimal precision out of range: ", precision); |
| } |
| return std::make_shared<Decimal128Type>(precision, scale); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Decimal256 type |
| |
| Decimal256Type::Decimal256Type(int32_t precision, int32_t scale) |
| : DecimalType(type_id, 32, precision, scale) { |
| ARROW_CHECK_GE(precision, kMinPrecision); |
| ARROW_CHECK_LE(precision, kMaxPrecision); |
| } |
| |
| Result<std::shared_ptr<DataType>> Decimal256Type::Make(int32_t precision, int32_t scale) { |
| if (precision < kMinPrecision || precision > kMaxPrecision) { |
| return Status::Invalid("Decimal precision out of range: ", precision); |
| } |
| return std::make_shared<Decimal256Type>(precision, scale); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Dictionary-encoded type |
| |
| Status DictionaryType::ValidateParameters(const DataType& index_type, |
| const DataType& value_type) { |
| if (!is_integer(index_type.id())) { |
| return Status::TypeError("Dictionary index type should be integer, got ", |
| index_type.ToString()); |
| } |
| return Status::OK(); |
| } |
| |
| int DictionaryType::bit_width() const { |
| return checked_cast<const FixedWidthType&>(*index_type_).bit_width(); |
| } |
| |
| Result<std::shared_ptr<DataType>> DictionaryType::Make( |
| const std::shared_ptr<DataType>& index_type, |
| const std::shared_ptr<DataType>& value_type, bool ordered) { |
| RETURN_NOT_OK(ValidateParameters(*index_type, *value_type)); |
| return std::make_shared<DictionaryType>(index_type, value_type, ordered); |
| } |
| |
| DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type, |
| const std::shared_ptr<DataType>& value_type, bool ordered) |
| : FixedWidthType(Type::DICTIONARY), |
| index_type_(index_type), |
| value_type_(value_type), |
| ordered_(ordered) { |
| ARROW_CHECK_OK(ValidateParameters(*index_type_, *value_type_)); |
| } |
| |
| DataTypeLayout DictionaryType::layout() const { |
| auto layout = index_type_->layout(); |
| layout.has_dictionary = true; |
| return layout; |
| } |
| |
| std::string DictionaryType::ToString() const { |
| std::stringstream ss; |
| ss << this->name() << "<values=" << value_type_->ToString() |
| << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">"; |
| return ss.str(); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Null type |
| |
| std::string NullType::ToString() const { return name(); } |
| |
| // ---------------------------------------------------------------------- |
| // FieldRef |
| |
| size_t FieldPath::hash() const { |
| return internal::ComputeStringHash<0>(indices().data(), indices().size() * sizeof(int)); |
| } |
| |
| std::string FieldPath::ToString() const { |
| if (this->indices().empty()) { |
| return "FieldPath(empty)"; |
| } |
| |
| std::string repr = "FieldPath("; |
| for (auto index : this->indices()) { |
| repr += std::to_string(index) + " "; |
| } |
| repr.back() = ')'; |
| return repr; |
| } |
| |
| struct FieldPathGetImpl { |
| static const DataType& GetType(const ArrayData& data) { return *data.type; } |
| |
| static void Summarize(const FieldVector& fields, std::stringstream* ss) { |
| *ss << "{ "; |
| for (const auto& field : fields) { |
| *ss << field->ToString() << ", "; |
| } |
| *ss << "}"; |
| } |
| |
| template <typename T> |
| static void Summarize(const std::vector<T>& columns, std::stringstream* ss) { |
| *ss << "{ "; |
| for (const auto& column : columns) { |
| *ss << GetType(*column) << ", "; |
| } |
| *ss << "}"; |
| } |
| |
| template <typename T> |
| static Status IndexError(const FieldPath* path, int out_of_range_depth, |
| const std::vector<T>& children) { |
| std::stringstream ss; |
| ss << "index out of range. "; |
| |
| ss << "indices=[ "; |
| int depth = 0; |
| for (int i : path->indices()) { |
| if (depth != out_of_range_depth) { |
| ss << i << " "; |
| continue; |
| } |
| ss << ">" << i << "< "; |
| ++depth; |
| } |
| ss << "] "; |
| |
| if (std::is_same<T, std::shared_ptr<Field>>::value) { |
| ss << "fields were: "; |
| } else { |
| ss << "columns had types: "; |
| } |
| Summarize(children, &ss); |
| |
| return Status::IndexError(ss.str()); |
| } |
| |
| template <typename T, typename GetChildren> |
| static Result<T> Get(const FieldPath* path, const std::vector<T>* children, |
| GetChildren&& get_children, int* out_of_range_depth) { |
| if (path->indices().empty()) { |
| return Status::Invalid("empty indices cannot be traversed"); |
| } |
| |
| int depth = 0; |
| const T* out; |
| for (int index : path->indices()) { |
| if (children == nullptr) { |
| return Status::NotImplemented("Get child data of non-struct array"); |
| } |
| |
| if (index < 0 || static_cast<size_t>(index) >= children->size()) { |
| *out_of_range_depth = depth; |
| return nullptr; |
| } |
| |
| out = &children->at(index); |
| children = get_children(*out); |
| ++depth; |
| } |
| |
| return *out; |
| } |
| |
| template <typename T, typename GetChildren> |
| static Result<T> Get(const FieldPath* path, const std::vector<T>* children, |
| GetChildren&& get_children) { |
| int out_of_range_depth = -1; |
| ARROW_ASSIGN_OR_RAISE(auto child, |
| Get(path, children, std::forward<GetChildren>(get_children), |
| &out_of_range_depth)); |
| if (child != nullptr) { |
| return std::move(child); |
| } |
| return IndexError(path, out_of_range_depth, *children); |
| } |
| |
| static Result<std::shared_ptr<Field>> Get(const FieldPath* path, |
| const FieldVector& fields) { |
| return FieldPathGetImpl::Get(path, &fields, [](const std::shared_ptr<Field>& field) { |
| return &field->type()->fields(); |
| }); |
| } |
| |
| static Result<std::shared_ptr<ArrayData>> Get(const FieldPath* path, |
| const ArrayDataVector& child_data) { |
| return FieldPathGetImpl::Get( |
| path, &child_data, |
| [](const std::shared_ptr<ArrayData>& data) -> const ArrayDataVector* { |
| if (data->type->id() != Type::STRUCT) { |
| return nullptr; |
| } |
| return &data->child_data; |
| }); |
| } |
| }; |
| |
| Result<std::shared_ptr<Field>> FieldPath::Get(const Schema& schema) const { |
| return FieldPathGetImpl::Get(this, schema.fields()); |
| } |
| |
| Result<std::shared_ptr<Field>> FieldPath::Get(const Field& field) const { |
| return FieldPathGetImpl::Get(this, field.type()->fields()); |
| } |
| |
| Result<std::shared_ptr<Field>> FieldPath::Get(const DataType& type) const { |
| return FieldPathGetImpl::Get(this, type.fields()); |
| } |
| |
| Result<std::shared_ptr<Field>> FieldPath::Get(const FieldVector& fields) const { |
| return FieldPathGetImpl::Get(this, fields); |
| } |
| |
| Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const { |
| ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data())); |
| return MakeArray(std::move(data)); |
| } |
| |
| Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const { |
| ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data())); |
| return MakeArray(std::move(data)); |
| } |
| |
| Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const { |
| if (data.type->id() != Type::STRUCT) { |
| return Status::NotImplemented("Get child data of non-struct array"); |
| } |
| return FieldPathGetImpl::Get(this, data.child_data); |
| } |
| |
| FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) { |
| DCHECK_GT(util::get<FieldPath>(impl_).indices().size(), 0); |
| } |
| |
| void FieldRef::Flatten(std::vector<FieldRef> children) { |
| // flatten children |
| struct Visitor { |
| void operator()(std::string* name) { *out++ = FieldRef(std::move(*name)); } |
| |
| void operator()(FieldPath* indices) { *out++ = FieldRef(std::move(*indices)); } |
| |
| void operator()(std::vector<FieldRef>* children) { |
| for (auto& child : *children) { |
| util::visit(*this, &child.impl_); |
| } |
| } |
| |
| std::back_insert_iterator<std::vector<FieldRef>> out; |
| }; |
| |
| std::vector<FieldRef> out; |
| Visitor visitor{std::back_inserter(out)}; |
| visitor(&children); |
| |
| DCHECK(!out.empty()); |
| DCHECK(std::none_of(out.begin(), out.end(), |
| [](const FieldRef& ref) { return ref.IsNested(); })); |
| |
| if (out.size() == 1) { |
| impl_ = std::move(out[0].impl_); |
| } else { |
| impl_ = std::move(out); |
| } |
| } |
| |
| Result<FieldRef> FieldRef::FromDotPath(const std::string& dot_path_arg) { |
| if (dot_path_arg.empty()) { |
| return Status::Invalid("Dot path was empty"); |
| } |
| |
| std::vector<FieldRef> children; |
| |
| util::string_view dot_path = dot_path_arg; |
| |
| auto parse_name = [&] { |
| std::string name; |
| for (;;) { |
| auto segment_end = dot_path.find_first_of("\\[."); |
| if (segment_end == util::string_view::npos) { |
| // dot_path doesn't contain any other special characters; consume all |
| name.append(dot_path.begin(), dot_path.end()); |
| dot_path = ""; |
| break; |
| } |
| |
| if (dot_path[segment_end] != '\\') { |
| // segment_end points to a subscript for a new FieldRef |
| name.append(dot_path.begin(), segment_end); |
| dot_path = dot_path.substr(segment_end); |
| break; |
| } |
| |
| if (dot_path.size() == segment_end + 1) { |
| // dot_path ends with backslash; consume it all |
| name.append(dot_path.begin(), dot_path.end()); |
| dot_path = ""; |
| break; |
| } |
| |
| // append all characters before backslash, then the character which follows it |
| name.append(dot_path.begin(), segment_end); |
| name.push_back(dot_path[segment_end + 1]); |
| dot_path = dot_path.substr(segment_end + 2); |
| } |
| return name; |
| }; |
| |
| while (!dot_path.empty()) { |
| auto subscript = dot_path[0]; |
| dot_path = dot_path.substr(1); |
| switch (subscript) { |
| case '.': { |
| // next element is a name |
| children.emplace_back(parse_name()); |
| continue; |
| } |
| case '[': { |
| auto subscript_end = dot_path.find_first_not_of("0123456789"); |
| if (subscript_end == util::string_view::npos || dot_path[subscript_end] != ']') { |
| return Status::Invalid("Dot path '", dot_path_arg, |
| "' contained an unterminated index"); |
| } |
| children.emplace_back(std::atoi(dot_path.data())); |
| dot_path = dot_path.substr(subscript_end + 1); |
| continue; |
| } |
| default: |
| return Status::Invalid("Dot path must begin with '[' or '.', got '", dot_path_arg, |
| "'"); |
| } |
| } |
| |
| FieldRef out; |
| out.Flatten(std::move(children)); |
| return out; |
| } |
| |
| size_t FieldRef::hash() const { |
| struct Visitor : std::hash<std::string> { |
| using std::hash<std::string>::operator(); |
| |
| size_t operator()(const FieldPath& path) { return path.hash(); } |
| |
| size_t operator()(const std::vector<FieldRef>& children) { |
| size_t hash = 0; |
| |
| for (const FieldRef& child : children) { |
| hash ^= child.hash(); |
| } |
| |
| return hash; |
| } |
| }; |
| |
| return util::visit(Visitor{}, impl_); |
| } |
| |
| std::string FieldRef::ToString() const { |
| struct Visitor { |
| std::string operator()(const FieldPath& path) { return path.ToString(); } |
| |
| std::string operator()(const std::string& name) { return "Name(" + name + ")"; } |
| |
| std::string operator()(const std::vector<FieldRef>& children) { |
| std::string repr = "Nested("; |
| for (const auto& child : children) { |
| repr += child.ToString() + " "; |
| } |
| repr.resize(repr.size() - 1); |
| repr += ")"; |
| return repr; |
| } |
| }; |
| |
| return "FieldRef." + util::visit(Visitor{}, impl_); |
| } |
| |
| std::vector<FieldPath> FieldRef::FindAll(const Schema& schema) const { |
| return FindAll(schema.fields()); |
| } |
| |
| std::vector<FieldPath> FieldRef::FindAll(const Field& field) const { |
| return FindAll(field.type()->fields()); |
| } |
| |
| std::vector<FieldPath> FieldRef::FindAll(const DataType& type) const { |
| return FindAll(type.fields()); |
| } |
| |
| std::vector<FieldPath> FieldRef::FindAll(const FieldVector& fields) const { |
| struct Visitor { |
| std::vector<FieldPath> operator()(const FieldPath& path) { |
| // skip long IndexError construction if path is out of range |
| int out_of_range_depth; |
| auto maybe_field = FieldPathGetImpl::Get( |
| &path, &fields_, |
| [](const std::shared_ptr<Field>& field) { return &field->type()->fields(); }, |
| &out_of_range_depth); |
| |
| DCHECK_OK(maybe_field.status()); |
| |
| if (maybe_field.ValueOrDie() != nullptr) { |
| return {path}; |
| } |
| return {}; |
| } |
| |
| std::vector<FieldPath> operator()(const std::string& name) { |
| std::vector<FieldPath> out; |
| |
| for (int i = 0; i < static_cast<int>(fields_.size()); ++i) { |
| if (fields_[i]->name() == name) { |
| out.push_back({i}); |
| } |
| } |
| |
| return out; |
| } |
| |
| struct Matches { |
| // referents[i] is referenced by prefixes[i] |
| std::vector<FieldPath> prefixes; |
| FieldVector referents; |
| |
| Matches(std::vector<FieldPath> matches, const FieldVector& fields) { |
| for (auto& match : matches) { |
| Add({}, std::move(match), fields); |
| } |
| } |
| |
| Matches() = default; |
| |
| size_t size() const { return referents.size(); } |
| |
| void Add(const FieldPath& prefix, const FieldPath& suffix, |
| const FieldVector& fields) { |
| auto maybe_field = suffix.Get(fields); |
| DCHECK_OK(maybe_field.status()); |
| referents.push_back(std::move(maybe_field).ValueOrDie()); |
| |
| std::vector<int> concatenated_indices(prefix.indices().size() + |
| suffix.indices().size()); |
| auto it = concatenated_indices.begin(); |
| for (auto path : {&prefix, &suffix}) { |
| it = std::copy(path->indices().begin(), path->indices().end(), it); |
| } |
| prefixes.emplace_back(std::move(concatenated_indices)); |
| } |
| }; |
| |
| std::vector<FieldPath> operator()(const std::vector<FieldRef>& refs) { |
| DCHECK_GE(refs.size(), 1); |
| Matches matches(refs.front().FindAll(fields_), fields_); |
| |
| for (auto ref_it = refs.begin() + 1; ref_it != refs.end(); ++ref_it) { |
| Matches next_matches; |
| for (size_t i = 0; i < matches.size(); ++i) { |
| const auto& referent = *matches.referents[i]; |
| |
| for (const FieldPath& match : ref_it->FindAll(referent)) { |
| next_matches.Add(matches.prefixes[i], match, referent.type()->fields()); |
| } |
| } |
| matches = std::move(next_matches); |
| } |
| |
| return matches.prefixes; |
| } |
| |
| const FieldVector& fields_; |
| }; |
| |
| return util::visit(Visitor{fields}, impl_); |
| } |
| |
| std::vector<FieldPath> FieldRef::FindAll(const ArrayData& array) const { |
| return FindAll(*array.type); |
| } |
| |
| std::vector<FieldPath> FieldRef::FindAll(const Array& array) const { |
| return FindAll(*array.type()); |
| } |
| |
| std::vector<FieldPath> FieldRef::FindAll(const RecordBatch& batch) const { |
| return FindAll(*batch.schema()); |
| } |
| |
| void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); } |
| |
| // ---------------------------------------------------------------------- |
| // Schema implementation |
| |
| std::string EndiannessToString(Endianness endianness) { |
| switch (endianness) { |
| case Endianness::Little: |
| return "little"; |
| case Endianness::Big: |
| return "big"; |
| default: |
| DCHECK(false) << "invalid endianness"; |
| return "???"; |
| } |
| } |
| |
| class Schema::Impl { |
| public: |
| Impl(std::vector<std::shared_ptr<Field>> fields, Endianness endianness, |
| std::shared_ptr<const KeyValueMetadata> metadata) |
| : fields_(std::move(fields)), |
| endianness_(endianness), |
| name_to_index_(CreateNameToIndexMap(fields_)), |
| metadata_(std::move(metadata)) {} |
| |
| std::vector<std::shared_ptr<Field>> fields_; |
| Endianness endianness_; |
| std::unordered_multimap<std::string, int> name_to_index_; |
| std::shared_ptr<const KeyValueMetadata> metadata_; |
| }; |
| |
| Schema::Schema(std::vector<std::shared_ptr<Field>> fields, Endianness endianness, |
| std::shared_ptr<const KeyValueMetadata> metadata) |
| : detail::Fingerprintable(), |
| impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {} |
| |
| Schema::Schema(std::vector<std::shared_ptr<Field>> fields, |
| std::shared_ptr<const KeyValueMetadata> metadata) |
| : detail::Fingerprintable(), |
| impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {} |
| |
| Schema::Schema(const Schema& schema) |
| : detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {} |
| |
| Schema::~Schema() = default; |
| |
| std::shared_ptr<Schema> Schema::WithEndianness(Endianness endianness) const { |
| return std::make_shared<Schema>(impl_->fields_, endianness, impl_->metadata_); |
| } |
| |
| Endianness Schema::endianness() const { return impl_->endianness_; } |
| |
| bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; } |
| |
| int Schema::num_fields() const { return static_cast<int>(impl_->fields_.size()); } |
| |
| const std::shared_ptr<Field>& Schema::field(int i) const { |
| DCHECK_GE(i, 0); |
| DCHECK_LT(i, num_fields()); |
| return impl_->fields_[i]; |
| } |
| |
| const std::vector<std::shared_ptr<Field>>& Schema::fields() const { |
| return impl_->fields_; |
| } |
| |
| bool Schema::Equals(const Schema& other, bool check_metadata) const { |
| if (this == &other) { |
| return true; |
| } |
| |
| // checks endianness equality |
| if (endianness() != other.endianness()) { |
| return false; |
| } |
| |
| // checks field equality |
| if (num_fields() != other.num_fields()) { |
| return false; |
| } |
| |
| if (check_metadata) { |
| const auto& metadata_fp = metadata_fingerprint(); |
| const auto& other_metadata_fp = other.metadata_fingerprint(); |
| if (metadata_fp != other_metadata_fp) { |
| return false; |
| } |
| } |
| |
| // Fast path using fingerprints, if possible |
| const auto& fp = fingerprint(); |
| const auto& other_fp = other.fingerprint(); |
| if (!fp.empty() && !other_fp.empty()) { |
| return fp == other_fp; |
| } |
| |
| // Fall back on field-by-field comparison |
| for (int i = 0; i < num_fields(); ++i) { |
| if (!field(i)->Equals(*other.field(i).get(), check_metadata)) { |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| bool Schema::Equals(const std::shared_ptr<Schema>& other, bool check_metadata) const { |
| if (other == nullptr) { |
| return false; |
| } |
| |
| return Equals(*other, check_metadata); |
| } |
| |
| std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) const { |
| int i = GetFieldIndex(name); |
| return i == -1 ? nullptr : impl_->fields_[i]; |
| } |
| |
| int Schema::GetFieldIndex(const std::string& name) const { |
| return LookupNameIndex(impl_->name_to_index_, name); |
| } |
| |
| std::vector<int> Schema::GetAllFieldIndices(const std::string& name) const { |
| std::vector<int> result; |
| auto p = impl_->name_to_index_.equal_range(name); |
| for (auto it = p.first; it != p.second; ++it) { |
| result.push_back(it->second); |
| } |
| if (result.size() > 1) { |
| std::sort(result.begin(), result.end()); |
| } |
| return result; |
| } |
| |
| Status Schema::CanReferenceFieldsByNames(const std::vector<std::string>& names) const { |
| for (const auto& name : names) { |
| if (GetFieldByName(name) == nullptr) { |
| return Status::Invalid("Field named '", name, |
| "' not found or not unique in the schema."); |
| } |
| } |
| |
| return Status::OK(); |
| } |
| |
| std::vector<std::shared_ptr<Field>> Schema::GetAllFieldsByName( |
| const std::string& name) const { |
| std::vector<std::shared_ptr<Field>> result; |
| auto p = impl_->name_to_index_.equal_range(name); |
| for (auto it = p.first; it != p.second; ++it) { |
| result.push_back(impl_->fields_[it->second]); |
| } |
| return result; |
| } |
| |
| Result<std::shared_ptr<Schema>> Schema::AddField( |
| int i, const std::shared_ptr<Field>& field) const { |
| if (i < 0 || i > this->num_fields()) { |
| return Status::Invalid("Invalid column index to add field."); |
| } |
| |
| return std::make_shared<Schema>(internal::AddVectorElement(impl_->fields_, i, field), |
| impl_->metadata_); |
| } |
| |
| Result<std::shared_ptr<Schema>> Schema::SetField( |
| int i, const std::shared_ptr<Field>& field) const { |
| if (i < 0 || i > this->num_fields()) { |
| return Status::Invalid("Invalid column index to add field."); |
| } |
| |
| return std::make_shared<Schema>( |
| internal::ReplaceVectorElement(impl_->fields_, i, field), impl_->metadata_); |
| } |
| |
| Result<std::shared_ptr<Schema>> Schema::RemoveField(int i) const { |
| if (i < 0 || i >= this->num_fields()) { |
| return Status::Invalid("Invalid column index to remove field."); |
| } |
| |
| return std::make_shared<Schema>(internal::DeleteVectorElement(impl_->fields_, i), |
| impl_->metadata_); |
| } |
| |
| bool Schema::HasMetadata() const { |
| return (impl_->metadata_ != nullptr) && (impl_->metadata_->size() > 0); |
| } |
| |
| bool Schema::HasDistinctFieldNames() const { |
| auto fields = field_names(); |
| std::unordered_set<std::string> names{fields.cbegin(), fields.cend()}; |
| return names.size() == fields.size(); |
| } |
| |
| std::shared_ptr<Schema> Schema::WithMetadata( |
| const std::shared_ptr<const KeyValueMetadata>& metadata) const { |
| return std::make_shared<Schema>(impl_->fields_, metadata); |
| } |
| |
| const std::shared_ptr<const KeyValueMetadata>& Schema::metadata() const { |
| return impl_->metadata_; |
| } |
| |
| std::shared_ptr<Schema> Schema::RemoveMetadata() const { |
| return std::make_shared<Schema>(impl_->fields_); |
| } |
| |
| std::string Schema::ToString(bool show_metadata) const { |
| std::stringstream buffer; |
| |
| int i = 0; |
| for (const auto& field : impl_->fields_) { |
| if (i > 0) { |
| buffer << std::endl; |
| } |
| buffer << field->ToString(show_metadata); |
| ++i; |
| } |
| |
| if (impl_->endianness_ != Endianness::Native) { |
| buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --"; |
| } |
| |
| if (show_metadata && HasMetadata()) { |
| buffer << impl_->metadata_->ToString(); |
| } |
| |
| return buffer.str(); |
| } |
| |
| std::vector<std::string> Schema::field_names() const { |
| std::vector<std::string> names; |
| for (const auto& field : impl_->fields_) { |
| names.push_back(field->name()); |
| } |
| return names; |
| } |
| |
| class SchemaBuilder::Impl { |
| public: |
| friend class SchemaBuilder; |
| Impl(ConflictPolicy policy, Field::MergeOptions field_merge_options) |
| : policy_(policy), field_merge_options_(field_merge_options) {} |
| |
| Impl(std::vector<std::shared_ptr<Field>> fields, |
| std::shared_ptr<const KeyValueMetadata> metadata, ConflictPolicy conflict_policy, |
| Field::MergeOptions field_merge_options) |
| : fields_(std::move(fields)), |
| name_to_index_(CreateNameToIndexMap(fields_)), |
| metadata_(std::move(metadata)), |
| policy_(conflict_policy), |
| field_merge_options_(field_merge_options) {} |
| |
| Status AddField(const std::shared_ptr<Field>& field) { |
| DCHECK_NE(field, nullptr); |
| |
| // Short-circuit, no lookup needed. |
| if (policy_ == CONFLICT_APPEND) { |
| return AppendField(field); |
| } |
| |
| auto name = field->name(); |
| constexpr int kNotFound = -1; |
| constexpr int kDuplicateFound = -2; |
| auto i = LookupNameIndex<kNotFound, kDuplicateFound>(name_to_index_, name); |
| |
| if (i == kNotFound) { |
| return AppendField(field); |
| } |
| |
| // From this point, there's one or more field in the builder that exists with |
| // the same name. |
| |
| if (policy_ == CONFLICT_IGNORE) { |
| // The ignore policy is more generous when there's duplicate in the builder. |
| return Status::OK(); |
| } else if (policy_ == CONFLICT_ERROR) { |
| return Status::Invalid("Duplicate found, policy dictate to treat as an error"); |
| } |
| |
| if (i == kDuplicateFound) { |
| // Cannot merge/replace when there's more than one field in the builder |
| // because we can't decide which to merge/replace. |
| return Status::Invalid("Cannot merge field ", name, |
| " more than one field with same name exists"); |
| } |
| |
| DCHECK_GE(i, 0); |
| |
| if (policy_ == CONFLICT_REPLACE) { |
| fields_[i] = field; |
| } else if (policy_ == CONFLICT_MERGE) { |
| ARROW_ASSIGN_OR_RAISE(fields_[i], fields_[i]->MergeWith(field)); |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status AppendField(const std::shared_ptr<Field>& field) { |
| name_to_index_.emplace(field->name(), static_cast<int>(fields_.size())); |
| fields_.push_back(field); |
| return Status::OK(); |
| } |
| |
| void Reset() { |
| fields_.clear(); |
| name_to_index_.clear(); |
| metadata_.reset(); |
| } |
| |
| private: |
| std::vector<std::shared_ptr<Field>> fields_; |
| std::unordered_multimap<std::string, int> name_to_index_; |
| std::shared_ptr<const KeyValueMetadata> metadata_; |
| ConflictPolicy policy_; |
| Field::MergeOptions field_merge_options_; |
| }; |
| |
| SchemaBuilder::SchemaBuilder(ConflictPolicy policy, |
| Field::MergeOptions field_merge_options) { |
| impl_ = internal::make_unique<Impl>(policy, field_merge_options); |
| } |
| |
| SchemaBuilder::SchemaBuilder(std::vector<std::shared_ptr<Field>> fields, |
| ConflictPolicy policy, |
| Field::MergeOptions field_merge_options) { |
| impl_ = internal::make_unique<Impl>(std::move(fields), nullptr, policy, |
| field_merge_options); |
| } |
| |
| SchemaBuilder::SchemaBuilder(const std::shared_ptr<Schema>& schema, ConflictPolicy policy, |
| Field::MergeOptions field_merge_options) { |
| std::shared_ptr<const KeyValueMetadata> metadata; |
| if (schema->HasMetadata()) { |
| metadata = schema->metadata()->Copy(); |
| } |
| |
| impl_ = internal::make_unique<Impl>(schema->fields(), std::move(metadata), policy, |
| field_merge_options); |
| } |
| |
| SchemaBuilder::~SchemaBuilder() {} |
| |
| SchemaBuilder::ConflictPolicy SchemaBuilder::policy() const { return impl_->policy_; } |
| |
| void SchemaBuilder::SetPolicy(SchemaBuilder::ConflictPolicy resolution) { |
| impl_->policy_ = resolution; |
| } |
| |
| Status SchemaBuilder::AddField(const std::shared_ptr<Field>& field) { |
| return impl_->AddField(field); |
| } |
| |
| Status SchemaBuilder::AddFields(const std::vector<std::shared_ptr<Field>>& fields) { |
| for (const auto& field : fields) { |
| RETURN_NOT_OK(AddField(field)); |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status SchemaBuilder::AddSchema(const std::shared_ptr<Schema>& schema) { |
| DCHECK_NE(schema, nullptr); |
| return AddFields(schema->fields()); |
| } |
| |
| Status SchemaBuilder::AddSchemas(const std::vector<std::shared_ptr<Schema>>& schemas) { |
| for (const auto& schema : schemas) { |
| RETURN_NOT_OK(AddSchema(schema)); |
| } |
| |
| return Status::OK(); |
| } |
| |
| Status SchemaBuilder::AddMetadata(const KeyValueMetadata& metadata) { |
| impl_->metadata_ = metadata.Copy(); |
| return Status::OK(); |
| } |
| |
| Result<std::shared_ptr<Schema>> SchemaBuilder::Finish() const { |
| return schema(impl_->fields_, impl_->metadata_); |
| } |
| |
| void SchemaBuilder::Reset() { impl_->Reset(); } |
| |
| Result<std::shared_ptr<Schema>> SchemaBuilder::Merge( |
| const std::vector<std::shared_ptr<Schema>>& schemas, ConflictPolicy policy) { |
| SchemaBuilder builder{policy}; |
| RETURN_NOT_OK(builder.AddSchemas(schemas)); |
| return builder.Finish(); |
| } |
| |
| Status SchemaBuilder::AreCompatible(const std::vector<std::shared_ptr<Schema>>& schemas, |
| ConflictPolicy policy) { |
| return Merge(schemas, policy).status(); |
| } |
| |
| std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields, |
| std::shared_ptr<const KeyValueMetadata> metadata) { |
| return std::make_shared<Schema>(std::move(fields), std::move(metadata)); |
| } |
| |
| std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields, |
| Endianness endianness, |
| std::shared_ptr<const KeyValueMetadata> metadata) { |
| return std::make_shared<Schema>(std::move(fields), endianness, std::move(metadata)); |
| } |
| |
| Result<std::shared_ptr<Schema>> UnifySchemas( |
| const std::vector<std::shared_ptr<Schema>>& schemas, |
| const Field::MergeOptions field_merge_options) { |
| if (schemas.empty()) { |
| return Status::Invalid("Must provide at least one schema to unify."); |
| } |
| |
| if (!schemas[0]->HasDistinctFieldNames()) { |
| return Status::Invalid("Can't unify schema with duplicate field names."); |
| } |
| |
| SchemaBuilder builder{schemas[0], SchemaBuilder::CONFLICT_MERGE, field_merge_options}; |
| |
| for (size_t i = 1; i < schemas.size(); i++) { |
| const auto& schema = schemas[i]; |
| if (!schema->HasDistinctFieldNames()) { |
| return Status::Invalid("Can't unify schema with duplicate field names."); |
| } |
| RETURN_NOT_OK(builder.AddSchema(schema)); |
| } |
| |
| return builder.Finish(); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Fingerprint computations |
| |
| namespace detail { |
| |
| Fingerprintable::~Fingerprintable() { |
| delete fingerprint_.load(); |
| delete metadata_fingerprint_.load(); |
| } |
| |
| template <typename ComputeFingerprint> |
| static const std::string& LoadFingerprint(std::atomic<std::string*>* fingerprint, |
| ComputeFingerprint&& compute_fingerprint) { |
| auto new_p = new std::string(std::forward<ComputeFingerprint>(compute_fingerprint)()); |
| // Since fingerprint() and metadata_fingerprint() return a *reference* to the |
| // allocated string, the first allocation ever should never be replaced by another |
| // one. Hence the compare_exchange_strong() against nullptr. |
| std::string* expected = nullptr; |
| if (fingerprint->compare_exchange_strong(expected, new_p)) { |
| return *new_p; |
| } else { |
| delete new_p; |
| DCHECK_NE(expected, nullptr); |
| return *expected; |
| } |
| } |
| |
| const std::string& Fingerprintable::LoadFingerprintSlow() const { |
| return LoadFingerprint(&fingerprint_, [this]() { return ComputeFingerprint(); }); |
| } |
| |
| const std::string& Fingerprintable::LoadMetadataFingerprintSlow() const { |
| return LoadFingerprint(&metadata_fingerprint_, |
| [this]() { return ComputeMetadataFingerprint(); }); |
| } |
| |
| } // namespace detail |
| |
| static inline std::string TypeIdFingerprint(const DataType& type) { |
| auto c = static_cast<int>(type.id()) + 'A'; |
| DCHECK_GE(c, 0); |
| DCHECK_LT(c, 128); // Unlikely to happen any soon |
| // Prefix with an unusual character in order to disambiguate |
| std::string s{'@', static_cast<char>(c)}; |
| return s; |
| } |
| |
| static char TimeUnitFingerprint(TimeUnit::type unit) { |
| switch (unit) { |
| case TimeUnit::SECOND: |
| return 's'; |
| case TimeUnit::MILLI: |
| return 'm'; |
| case TimeUnit::MICRO: |
| return 'u'; |
| case TimeUnit::NANO: |
| return 'n'; |
| default: |
| DCHECK(false) << "Unexpected TimeUnit"; |
| return '\0'; |
| } |
| } |
| |
| static char IntervalTypeFingerprint(IntervalType::type unit) { |
| switch (unit) { |
| case IntervalType::DAY_TIME: |
| return 'd'; |
| case IntervalType::MONTHS: |
| return 'M'; |
| default: |
| DCHECK(false) << "Unexpected IntervalType::type"; |
| return '\0'; |
| } |
| } |
| |
| static void AppendMetadataFingerprint(const KeyValueMetadata& metadata, |
| std::stringstream* ss) { |
| // Compute metadata fingerprint. KeyValueMetadata is not immutable, |
| // so we don't cache the result on the metadata instance. |
| const auto pairs = metadata.sorted_pairs(); |
| if (!pairs.empty()) { |
| *ss << "!{"; |
| for (const auto& p : pairs) { |
| const auto& k = p.first; |
| const auto& v = p.second; |
| // Since metadata strings can contain arbitrary characters, prefix with |
| // string length to disambiguate. |
| *ss << k.length() << ':' << k << ':'; |
| *ss << v.length() << ':' << v << ';'; |
| } |
| *ss << '}'; |
| } |
| } |
| |
| std::string Field::ComputeFingerprint() const { |
| const auto& type_fingerprint = type_->fingerprint(); |
| if (type_fingerprint.empty()) { |
| // Underlying DataType doesn't support fingerprinting. |
| return ""; |
| } |
| std::stringstream ss; |
| ss << 'F'; |
| if (nullable_) { |
| ss << 'n'; |
| } else { |
| ss << 'N'; |
| } |
| ss << name_; |
| ss << '{' << type_fingerprint << '}'; |
| return ss.str(); |
| } |
| |
| std::string Field::ComputeMetadataFingerprint() const { |
| std::stringstream ss; |
| if (metadata_) { |
| AppendMetadataFingerprint(*metadata_, &ss); |
| } |
| const auto& type_fingerprint = type_->metadata_fingerprint(); |
| if (!type_fingerprint.empty()) { |
| ss << "+{" << type_->metadata_fingerprint() << "}"; |
| } |
| return ss.str(); |
| } |
| |
| std::string Schema::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << "S{"; |
| for (const auto& field : fields()) { |
| const auto& field_fingerprint = field->fingerprint(); |
| if (field_fingerprint.empty()) { |
| return ""; |
| } |
| ss << field_fingerprint << ";"; |
| } |
| ss << (endianness() == Endianness::Little ? "L" : "B"); |
| ss << "}"; |
| return ss.str(); |
| } |
| |
| std::string Schema::ComputeMetadataFingerprint() const { |
| std::stringstream ss; |
| if (HasMetadata()) { |
| AppendMetadataFingerprint(*metadata(), &ss); |
| } |
| ss << "S{"; |
| for (const auto& field : fields()) { |
| const auto& field_fingerprint = field->metadata_fingerprint(); |
| ss << field_fingerprint << ";"; |
| } |
| ss << "}"; |
| return ss.str(); |
| } |
| |
| void PrintTo(const Schema& s, std::ostream* os) { *os << s; } |
| |
| std::string DataType::ComputeFingerprint() const { |
| // Default implementation returns empty string, signalling non-implemented |
| // functionality. |
| return ""; |
| } |
| |
| std::string DataType::ComputeMetadataFingerprint() const { |
| // Whatever the data type, metadata can only be found on child fields |
| std::string s; |
| for (const auto& child : children_) { |
| s += child->metadata_fingerprint() + ";"; |
| } |
| return s; |
| } |
| |
| #define PARAMETER_LESS_FINGERPRINT(TYPE_CLASS) \ |
| std::string TYPE_CLASS##Type::ComputeFingerprint() const { \ |
| return TypeIdFingerprint(*this); \ |
| } |
| |
| PARAMETER_LESS_FINGERPRINT(Null) |
| PARAMETER_LESS_FINGERPRINT(Boolean) |
| PARAMETER_LESS_FINGERPRINT(Int8) |
| PARAMETER_LESS_FINGERPRINT(Int16) |
| PARAMETER_LESS_FINGERPRINT(Int32) |
| PARAMETER_LESS_FINGERPRINT(Int64) |
| PARAMETER_LESS_FINGERPRINT(UInt8) |
| PARAMETER_LESS_FINGERPRINT(UInt16) |
| PARAMETER_LESS_FINGERPRINT(UInt32) |
| PARAMETER_LESS_FINGERPRINT(UInt64) |
| PARAMETER_LESS_FINGERPRINT(HalfFloat) |
| PARAMETER_LESS_FINGERPRINT(Float) |
| PARAMETER_LESS_FINGERPRINT(Double) |
| PARAMETER_LESS_FINGERPRINT(Binary) |
| PARAMETER_LESS_FINGERPRINT(LargeBinary) |
| PARAMETER_LESS_FINGERPRINT(String) |
| PARAMETER_LESS_FINGERPRINT(LargeString) |
| PARAMETER_LESS_FINGERPRINT(Date32) |
| PARAMETER_LESS_FINGERPRINT(Date64) |
| |
| #undef PARAMETER_LESS_FINGERPRINT |
| |
| std::string DictionaryType::ComputeFingerprint() const { |
| const auto& index_fingerprint = index_type_->fingerprint(); |
| const auto& value_fingerprint = value_type_->fingerprint(); |
| std::string ordered_fingerprint = ordered_ ? "1" : "0"; |
| |
| DCHECK(!index_fingerprint.empty()); // it's an integer type |
| if (!value_fingerprint.empty()) { |
| return TypeIdFingerprint(*this) + index_fingerprint + value_fingerprint + |
| ordered_fingerprint; |
| } |
| return ordered_fingerprint; |
| } |
| |
| std::string ListType::ComputeFingerprint() const { |
| const auto& child_fingerprint = children_[0]->fingerprint(); |
| if (!child_fingerprint.empty()) { |
| return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}"; |
| } |
| return ""; |
| } |
| |
| std::string LargeListType::ComputeFingerprint() const { |
| const auto& child_fingerprint = children_[0]->fingerprint(); |
| if (!child_fingerprint.empty()) { |
| return TypeIdFingerprint(*this) + "{" + child_fingerprint + "}"; |
| } |
| return ""; |
| } |
| |
| std::string MapType::ComputeFingerprint() const { |
| const auto& key_fingerprint = key_type()->fingerprint(); |
| const auto& item_fingerprint = item_type()->fingerprint(); |
| if (!key_fingerprint.empty() && !item_fingerprint.empty()) { |
| if (keys_sorted_) { |
| return TypeIdFingerprint(*this) + "s{" + key_fingerprint + item_fingerprint + "}"; |
| } else { |
| return TypeIdFingerprint(*this) + "{" + key_fingerprint + item_fingerprint + "}"; |
| } |
| } |
| return ""; |
| } |
| |
| std::string FixedSizeListType::ComputeFingerprint() const { |
| const auto& child_fingerprint = children_[0]->fingerprint(); |
| if (!child_fingerprint.empty()) { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << "[" << list_size_ << "]" |
| << "{" << child_fingerprint << "}"; |
| return ss.str(); |
| } |
| return ""; |
| } |
| |
| std::string FixedSizeBinaryType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "]"; |
| return ss.str(); |
| } |
| |
| std::string DecimalType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << "[" << byte_width_ << "," << precision_ << "," |
| << scale_ << "]"; |
| return ss.str(); |
| } |
| |
| std::string StructType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << "{"; |
| for (const auto& child : children_) { |
| const auto& child_fingerprint = child->fingerprint(); |
| if (child_fingerprint.empty()) { |
| return ""; |
| } |
| ss << child_fingerprint << ";"; |
| } |
| ss << "}"; |
| return ss.str(); |
| } |
| |
| std::string UnionType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this); |
| switch (mode()) { |
| case UnionMode::SPARSE: |
| ss << "[s"; |
| break; |
| case UnionMode::DENSE: |
| ss << "[d"; |
| break; |
| default: |
| DCHECK(false) << "Unexpected UnionMode"; |
| } |
| for (const auto code : type_codes_) { |
| // Represent code as integer, not raw character |
| ss << ':' << static_cast<int32_t>(code); |
| } |
| ss << "]{"; |
| for (const auto& child : children_) { |
| const auto& child_fingerprint = child->fingerprint(); |
| if (child_fingerprint.empty()) { |
| return ""; |
| } |
| ss << child_fingerprint << ";"; |
| } |
| ss << "}"; |
| return ss.str(); |
| } |
| |
| std::string TimeType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_); |
| return ss.str(); |
| } |
| |
| std::string TimestampType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_) << timezone_.length() |
| << ':' << timezone_; |
| return ss.str(); |
| } |
| |
| std::string IntervalType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << IntervalTypeFingerprint(interval_type()); |
| return ss.str(); |
| } |
| |
| std::string DurationType::ComputeFingerprint() const { |
| std::stringstream ss; |
| ss << TypeIdFingerprint(*this) << TimeUnitFingerprint(unit_); |
| return ss.str(); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // Visitors and factory functions |
| |
| Status DataType::Accept(TypeVisitor* visitor) const { |
| return VisitTypeInline(*this, visitor); |
| } |
| |
| #define TYPE_FACTORY(NAME, KLASS) \ |
| std::shared_ptr<DataType> NAME() { \ |
| static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \ |
| return result; \ |
| } |
| |
| TYPE_FACTORY(null, NullType) |
| TYPE_FACTORY(boolean, BooleanType) |
| TYPE_FACTORY(int8, Int8Type) |
| TYPE_FACTORY(uint8, UInt8Type) |
| TYPE_FACTORY(int16, Int16Type) |
| TYPE_FACTORY(uint16, UInt16Type) |
| TYPE_FACTORY(int32, Int32Type) |
| TYPE_FACTORY(uint32, UInt32Type) |
| TYPE_FACTORY(int64, Int64Type) |
| TYPE_FACTORY(uint64, UInt64Type) |
| TYPE_FACTORY(float16, HalfFloatType) |
| TYPE_FACTORY(float32, FloatType) |
| TYPE_FACTORY(float64, DoubleType) |
| TYPE_FACTORY(utf8, StringType) |
| TYPE_FACTORY(large_utf8, LargeStringType) |
| TYPE_FACTORY(binary, BinaryType) |
| TYPE_FACTORY(large_binary, LargeBinaryType) |
| TYPE_FACTORY(date64, Date64Type) |
| TYPE_FACTORY(date32, Date32Type) |
| |
| std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width) { |
| return std::make_shared<FixedSizeBinaryType>(byte_width); |
| } |
| |
| std::shared_ptr<DataType> duration(TimeUnit::type unit) { |
| return std::make_shared<DurationType>(unit); |
| } |
| |
| std::shared_ptr<DataType> day_time_interval() { |
| return std::make_shared<DayTimeIntervalType>(); |
| } |
| |
| std::shared_ptr<DataType> month_interval() { |
| return std::make_shared<MonthIntervalType>(); |
| } |
| |
| std::shared_ptr<DataType> timestamp(TimeUnit::type unit) { |
| return std::make_shared<TimestampType>(unit); |
| } |
| |
| std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone) { |
| return std::make_shared<TimestampType>(unit, timezone); |
| } |
| |
| std::shared_ptr<DataType> time32(TimeUnit::type unit) { |
| return std::make_shared<Time32Type>(unit); |
| } |
| |
| std::shared_ptr<DataType> time64(TimeUnit::type unit) { |
| return std::make_shared<Time64Type>(unit); |
| } |
| |
| std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) { |
| return std::make_shared<ListType>(value_type); |
| } |
| |
| std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) { |
| return std::make_shared<ListType>(value_field); |
| } |
| |
| std::shared_ptr<DataType> large_list(const std::shared_ptr<DataType>& value_type) { |
| return std::make_shared<LargeListType>(value_type); |
| } |
| |
| std::shared_ptr<DataType> large_list(const std::shared_ptr<Field>& value_field) { |
| return std::make_shared<LargeListType>(value_field); |
| } |
| |
| std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type, |
| std::shared_ptr<DataType> item_type, bool keys_sorted) { |
| return std::make_shared<MapType>(std::move(key_type), std::move(item_type), |
| keys_sorted); |
| } |
| |
| std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type, |
| std::shared_ptr<Field> item_field, bool keys_sorted) { |
| return std::make_shared<MapType>(std::move(key_type), std::move(item_field), |
| keys_sorted); |
| } |
| |
| std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<DataType>& value_type, |
| int32_t list_size) { |
| return std::make_shared<FixedSizeListType>(value_type, list_size); |
| } |
| |
| std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<Field>& value_field, |
| int32_t list_size) { |
| return std::make_shared<FixedSizeListType>(value_field, list_size); |
| } |
| |
| std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) { |
| return std::make_shared<StructType>(fields); |
| } |
| |
| std::shared_ptr<DataType> sparse_union(FieldVector child_fields, |
| std::vector<int8_t> type_codes) { |
| if (type_codes.empty()) { |
| type_codes = internal::Iota(static_cast<int8_t>(child_fields.size())); |
| } |
| return std::make_shared<SparseUnionType>(std::move(child_fields), |
| std::move(type_codes)); |
| } |
| std::shared_ptr<DataType> dense_union(FieldVector child_fields, |
| std::vector<int8_t> type_codes) { |
| if (type_codes.empty()) { |
| type_codes = internal::Iota(static_cast<int8_t>(child_fields.size())); |
| } |
| return std::make_shared<DenseUnionType>(std::move(child_fields), std::move(type_codes)); |
| } |
| |
| FieldVector FieldsFromArraysAndNames(std::vector<std::string> names, |
| const ArrayVector& arrays) { |
| FieldVector fields(arrays.size()); |
| int i = 0; |
| if (names.empty()) { |
| for (const auto& array : arrays) { |
| fields[i] = field(std::to_string(i), array->type()); |
| ++i; |
| } |
| } else { |
| DCHECK_EQ(names.size(), arrays.size()); |
| for (const auto& array : arrays) { |
| fields[i] = field(std::move(names[i]), array->type()); |
| ++i; |
| } |
| } |
| return fields; |
| } |
| |
| std::shared_ptr<DataType> sparse_union(const ArrayVector& children, |
| std::vector<std::string> field_names, |
| std::vector<int8_t> type_codes) { |
| if (type_codes.empty()) { |
| type_codes = internal::Iota(static_cast<int8_t>(children.size())); |
| } |
| auto fields = FieldsFromArraysAndNames(std::move(field_names), children); |
| return sparse_union(std::move(fields), std::move(type_codes)); |
| } |
| |
| std::shared_ptr<DataType> dense_union(const ArrayVector& children, |
| std::vector<std::string> field_names, |
| std::vector<int8_t> type_codes) { |
| if (type_codes.empty()) { |
| type_codes = internal::Iota(static_cast<int8_t>(children.size())); |
| } |
| auto fields = FieldsFromArraysAndNames(std::move(field_names), children); |
| return dense_union(std::move(fields), std::move(type_codes)); |
| } |
| |
| std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type, |
| const std::shared_ptr<DataType>& dict_type, |
| bool ordered) { |
| return std::make_shared<DictionaryType>(index_type, dict_type, ordered); |
| } |
| |
| std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type, |
| bool nullable, |
| std::shared_ptr<const KeyValueMetadata> metadata) { |
| return std::make_shared<Field>(std::move(name), std::move(type), nullable, |
| std::move(metadata)); |
| } |
| |
| std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type, |
| std::shared_ptr<const KeyValueMetadata> metadata) { |
| return std::make_shared<Field>(std::move(name), std::move(type), /*nullable=*/true, |
| std::move(metadata)); |
| } |
| |
| std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) { |
| return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale) |
| : decimal256(precision, scale); |
| } |
| |
| std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale) { |
| return std::make_shared<Decimal128Type>(precision, scale); |
| } |
| |
| std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale) { |
| return std::make_shared<Decimal256Type>(precision, scale); |
| } |
| |
| std::string Decimal128Type::ToString() const { |
| std::stringstream s; |
| s << "decimal128(" << precision_ << ", " << scale_ << ")"; |
| return s.str(); |
| } |
| |
| std::string Decimal256Type::ToString() const { |
| std::stringstream s; |
| s << "decimal256(" << precision_ << ", " << scale_ << ")"; |
| return s.str(); |
| } |
| |
| } // namespace arrow |