| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <limits>
#include <locale>
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <nlohmann/json.hpp>

#include "nanoarrow.hpp"
| |
| #ifndef NANOARROW_TESTING_HPP_INCLUDED |
| #define NANOARROW_TESTING_HPP_INCLUDED |
| |
| /// \defgroup nanoarrow_testing Nanoarrow Testing Helpers |
| /// |
| /// Utilities for testing nanoarrow structures and functions. |
| |
| namespace nanoarrow { |
| |
| namespace testing { |
| |
| namespace internal { |
| |
| // Internal representation of the various structures needed to import and/or export |
| // a dictionary array. We use a serialized version of the dictionary value because |
| // nanoarrow doesn't currently have the ability to copy or reference count an Array. |
| struct Dictionary { |
| nanoarrow::UniqueSchema schema; |
| int64_t column_length; |
| std::string column_json; |
| }; |
| |
| class DictionaryContext { |
| public: |
| DictionaryContext() : next_id_(0) {} |
| |
| ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema* values_schema) { |
| if (!HasDictionaryForId(dictionary_id)) { |
| dictionaries_[dictionary_id] = internal::Dictionary(); |
| NANOARROW_RETURN_NOT_OK( |
| ArrowSchemaDeepCopy(values_schema, dictionaries_[dictionary_id].schema.get())); |
| } |
| |
| dictionary_ids_[values_schema] = dictionary_id; |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t* dictionary_id) { |
| while (HasDictionaryForId(next_id_)) { |
| next_id_++; |
| } |
| |
| NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema)); |
| *dictionary_id = next_id_++; |
| return NANOARROW_OK; |
| } |
| |
| void RecordArray(int32_t dictionary_id, int64_t length, std::string column_json) { |
| dictionaries_[dictionary_id].column_length = length; |
| dictionaries_[dictionary_id].column_json = std::move(column_json); |
| } |
| |
| void RecordArray(const ArrowSchema* values_schema, int64_t length, |
| std::string column_json) { |
| auto ids_it = dictionary_ids_.find(values_schema); |
| RecordArray(ids_it->second, length, column_json); |
| } |
| |
| bool empty() { return dictionaries_.empty(); } |
| |
| void clear() { |
| dictionaries_.clear(); |
| dictionary_ids_.clear(); |
| next_id_ = 0; |
| } |
| |
| bool HasDictionaryForSchema(const ArrowSchema* values_schema) const { |
| return dictionary_ids_.find(values_schema) != dictionary_ids_.end(); |
| } |
| |
| bool HasDictionaryForId(int32_t dictionary_id) const { |
| return dictionaries_.find(dictionary_id) != dictionaries_.end(); |
| } |
| |
| const Dictionary& Get(int32_t dictionary_id) const { |
| auto dict_it = dictionaries_.find(dictionary_id); |
| return dict_it->second; |
| } |
| |
| const Dictionary& Get(const ArrowSchema* values_schema) const { |
| auto ids_it = dictionary_ids_.find(values_schema); |
| return Get(ids_it->second); |
| } |
| |
| const std::vector<int32_t> GetAllIds() const { |
| std::vector<int32_t> out; |
| out.reserve(dictionaries_.size()); |
| for (const auto& value : dictionaries_) { |
| out.push_back(value.first); |
| } |
| return out; |
| } |
| |
| private: |
| int32_t next_id_; |
| std::unordered_map<int32_t, Dictionary> dictionaries_; |
| std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_; |
| }; |
| |
| } // namespace internal |
| |
| /// \defgroup nanoarrow_testing-json Integration test helpers |
| /// |
| /// See testing format documentation for details of the JSON representation. This |
| /// representation is not canonical but can be used to implement integration tests with |
| /// other implementations. |
| /// |
| /// @{ |
| |
| /// \brief Writer for the Arrow integration testing JSON format |
| class TestingJSONWriter { |
| public: |
| TestingJSONWriter() : float_precision_(-1), include_metadata_(true) {} |
| |
| /// \brief Set the floating point precision of the writer |
| /// |
| /// The floating point precision by default is -1, which uses the JSON serializer |
| /// to encode the value in the output. When writing files specifically for |
| /// integration tests, floating point values should be rounded to 3 decimal places to |
| /// avoid serialization issues. |
| void set_float_precision(int value) { float_precision_ = value; } |
| |
| /// \brief Set whether metadata should be included in the output of a schema or field |
| /// |
| /// Use false to skip writing schema/field metadata in the output. |
| void set_include_metadata(bool value) { include_metadata_ = value; } |
| |
| void ResetDictionaries() { dictionaries_.clear(); } |
| |
| /// \brief Write an ArrowArrayStream as a data file JSON object to out |
| /// |
| /// Creates output like `{"schema": {...}, "batches": [...], ...}`. |
| ArrowErrorCode WriteDataFile(std::ostream& out, ArrowArrayStream* stream) { |
| if (stream == nullptr || stream->release == nullptr) { |
| return EINVAL; |
| } |
| |
| ResetDictionaries(); |
| |
| out << R"({"schema": )"; |
| |
| nanoarrow::UniqueSchema schema; |
| NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetSchema(stream, schema.get(), nullptr)); |
| NANOARROW_RETURN_NOT_OK(WriteSchema(out, schema.get())); |
| |
| nanoarrow::UniqueArrayView array_view; |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr)); |
| |
| out << R"(, "batches": [)"; |
| |
| nanoarrow::UniqueArray array; |
| std::string sep; |
| do { |
| NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetNext(stream, array.get(), nullptr)); |
| if (array->release == nullptr) { |
| break; |
| } |
| |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr)); |
| |
| out << sep; |
| sep = ", "; |
| NANOARROW_RETURN_NOT_OK(WriteBatch(out, schema.get(), array_view.get())); |
| array.reset(); |
| } while (true); |
| |
| out << "]"; |
| |
| if (!dictionaries_.empty()) { |
| out << R"(, "dictionaries": )"; |
| NANOARROW_RETURN_NOT_OK(WriteDictionaryBatches(out)); |
| } |
| |
| out << "}"; |
| |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Write a schema to out |
| /// |
| /// Creates output like `{"fields": [...], "metadata": [...]}`. |
| ArrowErrorCode WriteSchema(std::ostream& out, const ArrowSchema* schema) { |
| // Make sure we have a struct |
| if (std::string(schema->format) != "+s") { |
| return EINVAL; |
| } |
| |
| out << "{"; |
| |
| // Write fields |
| out << R"("fields": )"; |
| if (schema->n_children == 0) { |
| out << "[]"; |
| } else { |
| out << "["; |
| NANOARROW_RETURN_NOT_OK(WriteField(out, schema->children[0])); |
| for (int64_t i = 1; i < schema->n_children; i++) { |
| out << ", "; |
| NANOARROW_RETURN_NOT_OK(WriteField(out, schema->children[i])); |
| } |
| out << "]"; |
| } |
| |
| // Write metadata |
| if (ShouldWriteMetadata(schema->metadata)) { |
| out << R"(, "metadata": )"; |
| NANOARROW_RETURN_NOT_OK(WriteMetadata(out, schema->metadata)); |
| } |
| |
| out << "}"; |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Write a field to out |
| /// |
| /// Creates output like `{"name" : "col", "type": {...}, ...}` |
| ArrowErrorCode WriteField(std::ostream& out, const ArrowSchema* field) { |
| ArrowSchemaView view; |
| NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, field, nullptr)); |
| |
| out << "{"; |
| |
| // Write schema->name (may be null) |
| if (field->name == nullptr) { |
| out << R"("name": null)"; |
| } else { |
| out << R"("name": )"; |
| WriteString(out, ArrowCharView(field->name)); |
| } |
| |
| // Write nullability |
| if (field->flags & ARROW_FLAG_NULLABLE) { |
| out << R"(, "nullable": true)"; |
| } else { |
| out << R"(, "nullable": false)"; |
| } |
| |
| // For dictionary encoding, write type as the dictionary (values) type, |
| // record the dictionary schema, and write the "dictionary" member |
| if (field->dictionary != nullptr) { |
| ArrowSchemaView dictionary_view; |
| NANOARROW_RETURN_NOT_OK( |
| ArrowSchemaViewInit(&dictionary_view, field->dictionary, nullptr)); |
| |
| out << R"(, "type": )"; |
| NANOARROW_RETURN_NOT_OK(WriteType(out, &dictionary_view)); |
| |
| int32_t dictionary_id; |
| NANOARROW_RETURN_NOT_OK( |
| dictionaries_.RecordSchema(field->dictionary, &dictionary_id)); |
| |
| out << R"(, "dictionary": )"; |
| view.type = view.storage_type; |
| NANOARROW_RETURN_NOT_OK(WriteFieldDictionary( |
| out, dictionary_id, field->flags & ARROW_FLAG_DICTIONARY_ORDERED, &view)); |
| |
| // Write dictionary children |
| out << R"(, "children": )"; |
| NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field->dictionary)); |
| } else { |
| // Write non-dictionary type/children |
| out << R"(, "type": )"; |
| NANOARROW_RETURN_NOT_OK(WriteType(out, &view)); |
| |
| // Write children |
| out << R"(, "children": )"; |
| NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field)); |
| } |
| |
| // Write metadata |
| if (ShouldWriteMetadata(field->metadata)) { |
| out << R"(, "metadata": )"; |
| NANOARROW_RETURN_NOT_OK(WriteMetadata(out, field->metadata)); |
| } |
| |
| out << "}"; |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Write the type portion of a field |
| /// |
| /// Creates output like `{"name": "int", ...}` |
| ArrowErrorCode WriteType(std::ostream& out, const ArrowSchema* field) { |
| ArrowSchemaView view; |
| NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field, nullptr)); |
| NANOARROW_RETURN_NOT_OK(WriteType(out, &view)); |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Write the metadata portion of a field |
| /// |
| /// Creates output like `[{"key": "...", "value": "..."}, ...]`. |
| ArrowErrorCode WriteMetadata(std::ostream& out, const char* metadata) { |
| if (metadata == nullptr) { |
| out << "null"; |
| return NANOARROW_OK; |
| } |
| |
| ArrowMetadataReader reader; |
| NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); |
| if (reader.remaining_keys == 0) { |
| out << "[]"; |
| return NANOARROW_OK; |
| } |
| |
| out << "["; |
| NANOARROW_RETURN_NOT_OK(WriteMetadataItem(out, &reader)); |
| while (reader.remaining_keys > 0) { |
| out << ", "; |
| NANOARROW_RETURN_NOT_OK(WriteMetadataItem(out, &reader)); |
| } |
| |
| out << "]"; |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Write a "batch" to out |
| /// |
| /// Creates output like `{"count": 123, "columns": [...]}`. |
| ArrowErrorCode WriteBatch(std::ostream& out, const ArrowSchema* schema, |
| const ArrowArrayView* value) { |
| // Make sure we have a struct |
| if (std::string(schema->format) != "+s") { |
| return EINVAL; |
| } |
| |
| out << "{"; |
| |
| // Write length |
| out << R"("count": )" << value->length; |
| |
| // Write children |
| out << R"(, "columns": )"; |
| NANOARROW_RETURN_NOT_OK(WriteChildren(out, schema, value)); |
| |
| out << "}"; |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Write a column to out |
| /// |
| /// Creates output like `{"name": "col", "count": 123, "VALIDITY": [...], ...}`. |
| ArrowErrorCode WriteColumn(std::ostream& out, const ArrowSchema* field, |
| const ArrowArrayView* value) { |
| out << "{"; |
| |
| // Write schema->name (may be null) |
| if (field->name == nullptr) { |
| out << R"("name": null)"; |
| } else { |
| out << R"("name": )"; |
| WriteString(out, ArrowCharView(field->name)); |
| } |
| |
| // Write length |
| out << R"(, "count": )" << value->length; |
| |
| // Write the VALIDITY element if required |
| switch (value->storage_type) { |
| case NANOARROW_TYPE_NA: |
| case NANOARROW_TYPE_DENSE_UNION: |
| case NANOARROW_TYPE_SPARSE_UNION: |
| break; |
| default: |
| out << R"(, "VALIDITY": )"; |
| WriteBitmap(out, value->buffer_views[0].data.as_uint8, value->length); |
| break; |
| } |
| |
| // Write the TYPE_ID element if required |
| switch (value->storage_type) { |
| case NANOARROW_TYPE_SPARSE_UNION: |
| case NANOARROW_TYPE_DENSE_UNION: |
| out << R"(, "TYPE_ID": )"; |
| NANOARROW_RETURN_NOT_OK(WriteOffsetOrTypeID<int8_t>(out, value->buffer_views[0])); |
| break; |
| default: |
| break; |
| } |
| |
| // Write the OFFSET element if required |
| switch (value->storage_type) { |
| case NANOARROW_TYPE_BINARY: |
| case NANOARROW_TYPE_STRING: |
| case NANOARROW_TYPE_DENSE_UNION: |
| case NANOARROW_TYPE_LIST: |
| out << R"(, "OFFSET": )"; |
| NANOARROW_RETURN_NOT_OK( |
| WriteOffsetOrTypeID<int32_t>(out, value->buffer_views[1])); |
| break; |
| case NANOARROW_TYPE_LARGE_LIST: |
| case NANOARROW_TYPE_LARGE_BINARY: |
| case NANOARROW_TYPE_LARGE_STRING: |
| out << R"(, "OFFSET": )"; |
| NANOARROW_RETURN_NOT_OK( |
| WriteOffsetOrTypeID<int64_t>(out, value->buffer_views[1])); |
| break; |
| default: |
| break; |
| } |
| |
| // Write the DATA element if required |
| switch (value->storage_type) { |
| case NANOARROW_TYPE_NA: |
| case NANOARROW_TYPE_STRUCT: |
| case NANOARROW_TYPE_LIST: |
| case NANOARROW_TYPE_LARGE_LIST: |
| case NANOARROW_TYPE_FIXED_SIZE_LIST: |
| case NANOARROW_TYPE_MAP: |
| case NANOARROW_TYPE_DENSE_UNION: |
| case NANOARROW_TYPE_SPARSE_UNION: |
| break; |
| default: |
| out << R"(, "DATA": )"; |
| NANOARROW_RETURN_NOT_OK(WriteData(out, value)); |
| break; |
| } |
| |
| switch (value->storage_type) { |
| case NANOARROW_TYPE_STRUCT: |
| case NANOARROW_TYPE_LIST: |
| case NANOARROW_TYPE_LARGE_LIST: |
| case NANOARROW_TYPE_FIXED_SIZE_LIST: |
| case NANOARROW_TYPE_DENSE_UNION: |
| case NANOARROW_TYPE_SPARSE_UNION: |
| out << R"(, "children": )"; |
| NANOARROW_RETURN_NOT_OK(WriteChildren(out, field, value)); |
| break; |
| default: |
| break; |
| } |
| |
| out << "}"; |
| |
| // Write the dictionary values to the DictionaryContext for later if applicable |
| if (field->dictionary != nullptr) { |
| if (!dictionaries_.HasDictionaryForSchema(field->dictionary)) { |
| return EINVAL; |
| } |
| |
| std::stringstream dictionary_output; |
| NANOARROW_RETURN_NOT_OK( |
| WriteColumn(dictionary_output, field->dictionary, value->dictionary)); |
| dictionaries_.RecordArray(field->dictionary, value->dictionary->length, |
| dictionary_output.str()); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode WriteDictionaryBatches(std::ostream& out) { |
| std::vector<int32_t> ids = dictionaries_.GetAllIds(); |
| if (ids.empty()) { |
| out << "[]"; |
| return NANOARROW_OK; |
| } |
| |
| out << "["; |
| std::sort(ids.begin(), ids.end()); |
| NANOARROW_RETURN_NOT_OK(WriteDictionaryBatch(out, ids[0])); |
| for (size_t i = 1; i < ids.size(); i++) { |
| out << ", "; |
| NANOARROW_RETURN_NOT_OK(WriteDictionaryBatch(out, ids[i])); |
| } |
| out << "]"; |
| |
| return NANOARROW_OK; |
| } |
| |
| private: |
| int float_precision_; |
| bool include_metadata_; |
| internal::DictionaryContext dictionaries_; |
| |
| bool ShouldWriteMetadata(const char* metadata) { |
| return metadata != nullptr && include_metadata_; |
| } |
| |
| ArrowErrorCode WriteDictionaryBatch(std::ostream& out, int32_t dictionary_id) { |
| const internal::Dictionary& dict = dictionaries_.Get(dictionary_id); |
| out << R"({"id": )" << dictionary_id << R"(, "data": {"count": )" |
| << dict.column_length << R"(, "columns": [)" << dict.column_json << "]}}"; |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode WriteFieldChildren(std::ostream& out, const ArrowSchema* field) { |
| if (field->n_children == 0) { |
| out << "[]"; |
| } else { |
| out << "["; |
| NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0])); |
| for (int64_t i = 1; i < field->n_children; i++) { |
| out << ", "; |
| NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i])); |
| } |
| out << "]"; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) { |
| out << "{"; |
| |
| switch (field->type) { |
| case NANOARROW_TYPE_NA: |
| out << R"("name": "null")"; |
| break; |
| case NANOARROW_TYPE_BOOL: |
| out << R"("name": "bool")"; |
| break; |
| case NANOARROW_TYPE_INT8: |
| case NANOARROW_TYPE_INT16: |
| case NANOARROW_TYPE_INT32: |
| case NANOARROW_TYPE_INT64: |
| out << R"("name": "int", "bitWidth": )" << field->layout.element_size_bits[1] |
| << R"(, "isSigned": true)"; |
| break; |
| case NANOARROW_TYPE_UINT8: |
| case NANOARROW_TYPE_UINT16: |
| case NANOARROW_TYPE_UINT64: |
| case NANOARROW_TYPE_UINT32: |
| out << R"("name": "int", "bitWidth": )" << field->layout.element_size_bits[1] |
| << R"(, "isSigned": false)"; |
| break; |
| case NANOARROW_TYPE_HALF_FLOAT: |
| out << R"("name": "floatingpoint", "precision": "HALF")"; |
| break; |
| case NANOARROW_TYPE_FLOAT: |
| out << R"("name": "floatingpoint", "precision": "SINGLE")"; |
| break; |
| case NANOARROW_TYPE_DOUBLE: |
| out << R"("name": "floatingpoint", "precision": "DOUBLE")"; |
| break; |
| case NANOARROW_TYPE_STRING: |
| out << R"("name": "utf8")"; |
| break; |
| case NANOARROW_TYPE_LARGE_STRING: |
| out << R"("name": "largeutf8")"; |
| break; |
| case NANOARROW_TYPE_BINARY: |
| out << R"("name": "binary")"; |
| break; |
| case NANOARROW_TYPE_LARGE_BINARY: |
| out << R"("name": "largebinary")"; |
| break; |
| case NANOARROW_TYPE_FIXED_SIZE_BINARY: |
| out << R"("name": "fixedsizebinary", "byteWidth": )" << field->fixed_size; |
| break; |
| case NANOARROW_TYPE_DECIMAL128: |
| case NANOARROW_TYPE_DECIMAL256: |
| out << R"("name": "decimal", "bitWidth": )" << field->decimal_bitwidth |
| << R"(, "precision": )" << field->decimal_precision << R"(, "scale": )" |
| << field->decimal_scale; |
| break; |
| case NANOARROW_TYPE_DURATION: |
| out << R"("name": "duration")"; |
| NANOARROW_RETURN_NOT_OK(WriteTimeUnit(out, field)); |
| break; |
| case NANOARROW_TYPE_DATE32: |
| out << R"("name": "date", "unit": "DAY")"; |
| break; |
| case NANOARROW_TYPE_DATE64: |
| out << R"("name": "date", "unit": "MILLISECOND")"; |
| break; |
| case NANOARROW_TYPE_TIME32: |
| out << R"("name": "time")"; |
| NANOARROW_RETURN_NOT_OK(WriteTimeUnit(out, field)); |
| out << R"(, "bitWidth": 32)"; |
| break; |
| case NANOARROW_TYPE_TIME64: |
| out << R"("name": "time")"; |
| NANOARROW_RETURN_NOT_OK(WriteTimeUnit(out, field)); |
| out << R"(, "bitWidth": 64)"; |
| break; |
| case NANOARROW_TYPE_TIMESTAMP: |
| out << R"("name": "timestamp")"; |
| NANOARROW_RETURN_NOT_OK(WriteTimeUnit(out, field)); |
| if (strlen(field->timezone) > 0) { |
| out << R"(, "timezone": )"; |
| WriteString(out, ArrowCharView(field->timezone)); |
| } |
| break; |
| case NANOARROW_TYPE_INTERVAL_MONTHS: |
| out << R"("name": "interval", "unit": "YEAR_MONTH")"; |
| break; |
| case NANOARROW_TYPE_INTERVAL_DAY_TIME: |
| out << R"("name": "interval", "unit": "DAY_TIME")"; |
| break; |
| case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: |
| out << R"("name": "interval", "unit": "MONTH_DAY_NANO")"; |
| break; |
| case NANOARROW_TYPE_STRUCT: |
| out << R"("name": "struct")"; |
| break; |
| case NANOARROW_TYPE_LIST: |
| out << R"("name": "list")"; |
| break; |
| case NANOARROW_TYPE_MAP: |
| out << R"("name": "map", "keysSorted": )"; |
| if (field->schema->flags & ARROW_FLAG_MAP_KEYS_SORTED) { |
| out << "true"; |
| } else { |
| out << "false"; |
| } |
| break; |
| case NANOARROW_TYPE_LARGE_LIST: |
| out << R"("name": "largelist")"; |
| break; |
| case NANOARROW_TYPE_FIXED_SIZE_LIST: |
| out << R"("name": "fixedsizelist", "listSize": )" |
| << field->layout.child_size_elements; |
| break; |
| case NANOARROW_TYPE_DENSE_UNION: |
| out << R"("name": "union", "mode": "DENSE", "typeIds": [)" |
| << field->union_type_ids << "]"; |
| break; |
| case NANOARROW_TYPE_SPARSE_UNION: |
| out << R"("name": "union", "mode": "SPARSE", "typeIds": [)" |
| << field->union_type_ids << "]"; |
| break; |
| |
| default: |
| // Not supported |
| return ENOTSUP; |
| } |
| |
| out << "}"; |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode WriteTimeUnit(std::ostream& out, const ArrowSchemaView* field) { |
| switch (field->time_unit) { |
| case NANOARROW_TIME_UNIT_NANO: |
| out << R"(, "unit": "NANOSECOND")"; |
| return NANOARROW_OK; |
| case NANOARROW_TIME_UNIT_MICRO: |
| out << R"(, "unit": "MICROSECOND")"; |
| return NANOARROW_OK; |
| case NANOARROW_TIME_UNIT_MILLI: |
| out << R"(, "unit": "MILLISECOND")"; |
| return NANOARROW_OK; |
| case NANOARROW_TIME_UNIT_SECOND: |
| out << R"(, "unit": "SECOND")"; |
| return NANOARROW_OK; |
| default: |
| return EINVAL; |
| } |
| } |
| |
| ArrowErrorCode WriteFieldDictionary(std::ostream& out, int32_t dictionary_id, |
| bool is_ordered, |
| const ArrowSchemaView* indices_field) { |
| out << "{"; |
| |
| out << R"("id": )" << dictionary_id; |
| |
| out << R"(, "indexType": )"; |
| NANOARROW_RETURN_NOT_OK(WriteType(out, indices_field)); |
| |
| if (is_ordered) { |
| out << R"(, "isOrdered": true)"; |
| } else { |
| out << R"(, "isOrdered": false)"; |
| } |
| |
| out << "}"; |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode WriteMetadataItem(std::ostream& out, ArrowMetadataReader* reader) { |
| ArrowStringView key; |
| ArrowStringView value; |
| NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderRead(reader, &key, &value)); |
| out << R"({"key": )"; |
| WriteString(out, key); |
| out << R"(, "value": )"; |
| WriteString(out, value); |
| out << "}"; |
| return NANOARROW_OK; |
| } |
| |
| void WriteBitmap(std::ostream& out, const uint8_t* bits, int64_t length) { |
| if (length == 0) { |
| out << "[]"; |
| return; |
| } |
| |
| out << "["; |
| |
| if (bits == nullptr) { |
| out << "1"; |
| for (int64_t i = 1; i < length; i++) { |
| out << ", 1"; |
| } |
| } else { |
| out << static_cast<int32_t>(ArrowBitGet(bits, 0)); |
| for (int64_t i = 1; i < length; i++) { |
| out << ", " << static_cast<int32_t>(ArrowBitGet(bits, i)); |
| } |
| } |
| |
| out << "]"; |
| } |
| |
| template <typename T> |
| ArrowErrorCode WriteOffsetOrTypeID(std::ostream& out, ArrowBufferView content) { |
| if (content.size_bytes == 0) { |
| out << "[]"; |
| return NANOARROW_OK; |
| } |
| |
| const T* values = reinterpret_cast<const T*>(content.data.data); |
| int64_t n_values = content.size_bytes / sizeof(T); |
| |
| out << "["; |
| |
| if (sizeof(T) == sizeof(int64_t)) { |
| // Ensure int64s are quoted (i.e, "123456") |
| out << R"(")" << values[0] << R"(")"; |
| for (int64_t i = 1; i < n_values; i++) { |
| out << R"(, ")" << values[i] << R"(")"; |
| } |
| } else { |
| // No need to quote smaller ints (i.e., 123456) |
| out << static_cast<int64_t>(values[0]); |
| for (int64_t i = 1; i < n_values; i++) { |
| out << ", " << static_cast<int64_t>(values[i]); |
| } |
| } |
| |
| out << "]"; |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode WriteData(std::ostream& out, const ArrowArrayView* value) { |
| if (value->length == 0) { |
| out << "[]"; |
| return NANOARROW_OK; |
| } |
| |
| out << "["; |
| |
| switch (value->storage_type) { |
| case NANOARROW_TYPE_BOOL: |
| case NANOARROW_TYPE_INT8: |
| case NANOARROW_TYPE_UINT8: |
| case NANOARROW_TYPE_INT16: |
| case NANOARROW_TYPE_UINT16: |
| case NANOARROW_TYPE_INT32: |
| case NANOARROW_TYPE_UINT32: |
| case NANOARROW_TYPE_INTERVAL_MONTHS: |
| // Regular JSON integers (i.e., 123456) |
| WriteIntMaybeNull(out, value, 0); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteIntMaybeNull(out, value, i); |
| } |
| break; |
| case NANOARROW_TYPE_INT64: |
| // Quoted integers to avoid overflow (i.e., "123456") |
| WriteQuotedIntMaybeNull(out, value, 0); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteQuotedIntMaybeNull(out, value, i); |
| } |
| break; |
| case NANOARROW_TYPE_UINT64: |
| // Quoted integers to avoid overflow (i.e., "123456") |
| WriteQuotedUIntMaybeNull(out, value, 0); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteQuotedUIntMaybeNull(out, value, i); |
| } |
| break; |
| |
| case NANOARROW_TYPE_FLOAT: |
| case NANOARROW_TYPE_DOUBLE: { |
| // JSON number to float_precision_ decimal places |
| LocalizedStream local_stream_opt(out); |
| local_stream_opt.SetFixed(float_precision_); |
| |
| WriteFloatMaybeNull(out, value, 0); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteFloatMaybeNull(out, value, i); |
| } |
| break; |
| } |
| |
| case NANOARROW_TYPE_STRING: |
| case NANOARROW_TYPE_LARGE_STRING: |
| WriteString(out, ArrowArrayViewGetStringUnsafe(value, 0)); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteString(out, ArrowArrayViewGetStringUnsafe(value, i)); |
| } |
| break; |
| |
| case NANOARROW_TYPE_BINARY: |
| case NANOARROW_TYPE_LARGE_BINARY: |
| case NANOARROW_TYPE_FIXED_SIZE_BINARY: { |
| WriteBytesMaybeNull(out, value, 0); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteBytesMaybeNull(out, value, i); |
| } |
| break; |
| } |
| |
| case NANOARROW_TYPE_INTERVAL_DAY_TIME: { |
| ArrowInterval interval; |
| ArrowIntervalInit(&interval, value->storage_type); |
| WriteIntervalDayTimeMaybeNull(out, value, 0, &interval); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteIntervalDayTimeMaybeNull(out, value, i, &interval); |
| } |
| break; |
| } |
| |
| case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: { |
| ArrowInterval interval; |
| ArrowIntervalInit(&interval, value->storage_type); |
| WriteIntervalMonthDayNanoMaybeNull(out, value, 0, &interval); |
| for (int64_t i = 1; i < value->length; i++) { |
| out << ", "; |
| WriteIntervalMonthDayNanoMaybeNull(out, value, i, &interval); |
| } |
| break; |
| } |
| |
| case NANOARROW_TYPE_DECIMAL128: |
| NANOARROW_RETURN_NOT_OK(WriteDecimalData(out, value, 128)); |
| break; |
| case NANOARROW_TYPE_DECIMAL256: |
| NANOARROW_RETURN_NOT_OK(WriteDecimalData(out, value, 256)); |
| break; |
| |
| default: |
| // Not supported |
| return ENOTSUP; |
| } |
| |
| out << "]"; |
| return NANOARROW_OK; |
| } |
| |
| void WriteIntMaybeNull(std::ostream& out, const ArrowArrayView* view, int64_t i) { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << 0; |
| } else { |
| out << ArrowArrayViewGetIntUnsafe(view, i); |
| } |
| } |
| |
| void WriteQuotedIntMaybeNull(std::ostream& out, const ArrowArrayView* view, int64_t i) { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << R"("0")"; |
| } else { |
| out << R"(")" << ArrowArrayViewGetIntUnsafe(view, i) << R"(")"; |
| } |
| } |
| |
| void WriteQuotedUIntMaybeNull(std::ostream& out, const ArrowArrayView* view, |
| int64_t i) { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << R"("0")"; |
| } else { |
| out << R"(")" << ArrowArrayViewGetUIntUnsafe(view, i) << R"(")"; |
| } |
| } |
| |
| void WriteFloatMaybeNull(std::ostream& out, const ArrowArrayView* view, int64_t i) { |
| if (float_precision_ >= 0) { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << static_cast<double>(0); |
| } else { |
| out << ArrowArrayViewGetDoubleUnsafe(view, i); |
| } |
| } else { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << "0.0"; |
| } else { |
| out << nlohmann::json(ArrowArrayViewGetDoubleUnsafe(view, i)); |
| } |
| } |
| } |
| |
| void WriteBytesMaybeNull(std::ostream& out, const ArrowArrayView* view, int64_t i) { |
| ArrowBufferView item = ArrowArrayViewGetBytesUnsafe(view, i); |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << R"(")"; |
| for (int64_t i = 0; i < item.size_bytes; i++) { |
| out << "00"; |
| } |
| out << R"(")"; |
| } else { |
| WriteBytes(out, item); |
| } |
| } |
| |
| void WriteIntervalDayTimeMaybeNull(std::ostream& out, const ArrowArrayView* view, |
| int64_t i, ArrowInterval* interval) { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << R"({"days": 0, "milliseconds": 0})"; |
| } else { |
| ArrowArrayViewGetIntervalUnsafe(view, i, interval); |
| out << R"({"days": )" << interval->days << R"(, "milliseconds": )" << interval->ms |
| << "}"; |
| } |
| } |
| |
| void WriteIntervalMonthDayNanoMaybeNull(std::ostream& out, const ArrowArrayView* view, |
| int64_t i, ArrowInterval* interval) { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << R"({"months": 0, "days": 0, "nanoseconds": "0"})"; |
| } else { |
| ArrowArrayViewGetIntervalUnsafe(view, i, interval); |
| out << R"({"months": )" << interval->months << R"(, "days": )" << interval->days |
| << R"(, "nanoseconds": ")" << interval->ns << R"("})"; |
| } |
| } |
| |
| ArrowErrorCode WriteDecimalData(std::ostream& out, const ArrowArrayView* view, |
| int bitwidth) { |
| ArrowDecimal value; |
| ArrowDecimalInit(&value, bitwidth, 0, 0); |
| nanoarrow::UniqueBuffer tmp; |
| |
| NANOARROW_RETURN_NOT_OK(WriteDecimalMaybeNull(out, view, 0, &value, tmp.get())); |
| for (int64_t i = 1; i < view->length; i++) { |
| out << ", "; |
| NANOARROW_RETURN_NOT_OK(WriteDecimalMaybeNull(out, view, i, &value, tmp.get())); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode WriteDecimalMaybeNull(std::ostream& out, const ArrowArrayView* view, |
| int64_t i, ArrowDecimal* decimal, |
| ArrowBuffer* tmp) { |
| if (ArrowArrayViewIsNull(view, i)) { |
| out << R"("0")"; |
| return NANOARROW_OK; |
| } else { |
| ArrowArrayViewGetDecimalUnsafe(view, i, decimal); |
| tmp->size_bytes = 0; |
| NANOARROW_RETURN_NOT_OK(ArrowDecimalAppendDigitsToBuffer(decimal, tmp)); |
| out << R"(")" << std::string(reinterpret_cast<char*>(tmp->data), tmp->size_bytes) |
| << R"(")"; |
| return NANOARROW_OK; |
| } |
| } |
| |
| void WriteString(std::ostream& out, ArrowStringView value) { |
| out << R"(")"; |
| |
| for (int64_t i = 0; i < value.size_bytes; i++) { |
| char c = value.data[i]; |
| if (c == '"') { |
| out << R"(\")"; |
| } else if (c == '\\') { |
| out << R"(\\)"; |
| } else if (c >= 0 && c < 32) { |
| // Control characters need to be escaped with a \uXXXX escape |
| uint16_t utf16_bytes = static_cast<uint16_t>(c); |
| |
| char utf16_esc[7]; |
| utf16_esc[6] = '\0'; |
| snprintf(utf16_esc, sizeof(utf16_esc), R"(\u%04x)", utf16_bytes); |
| out << utf16_esc; |
| } else { |
| out << c; |
| } |
| } |
| |
| out << R"(")"; |
| } |
| |
| void WriteBytes(std::ostream& out, ArrowBufferView value) { |
| out << R"(")"; |
| char hex[3]; |
| hex[2] = '\0'; |
| |
| for (int64_t i = 0; i < value.size_bytes; i++) { |
| snprintf(hex, sizeof(hex), "%02X", static_cast<int>(value.data.as_uint8[i])); |
| out << hex; |
| } |
| out << R"(")"; |
| } |
| |
| ArrowErrorCode WriteChildren(std::ostream& out, const ArrowSchema* field, |
| const ArrowArrayView* value) { |
| if (field->n_children == 0) { |
| out << "[]"; |
| return NANOARROW_OK; |
| } |
| |
| out << "["; |
| NANOARROW_RETURN_NOT_OK(WriteColumn(out, field->children[0], value->children[0])); |
| for (int64_t i = 1; i < field->n_children; i++) { |
| out << ", "; |
| NANOARROW_RETURN_NOT_OK(WriteColumn(out, field->children[i], value->children[i])); |
| } |
| out << "]"; |
| return NANOARROW_OK; |
| } |
| |
// Scope guard that switches a stream to the classic ("C") locale and fixed
// floating point notation, and restores the stream's previous locale, format flags
// and precision when destroyed. Not copyable: a copy would restore the stream a
// second time.
class LocalizedStream {
 public:
  explicit LocalizedStream(std::ostream& out)
      : out_(out),
        previous_locale_(out.imbue(std::locale::classic())),
        fmt_flags_(out.flags()),
        previous_precision_(out.precision()) {
    // Clear the whole floatfield group while setting fixed: OR-ing fixed into a
    // stream that already has scientific set would select hexfloat output instead
    out_.setf(std::ios_base::fixed, std::ios_base::floatfield);
  }

  LocalizedStream(const LocalizedStream&) = delete;
  LocalizedStream& operator=(const LocalizedStream&) = delete;

  // Set the number of digits written after the decimal point
  void SetFixed(int precision) { out_.precision(precision); }

  ~LocalizedStream() {
    out_.flags(fmt_flags_);
    out_.precision(previous_precision_);
    out_.imbue(previous_locale_);
  }

 private:
  std::ostream& out_;
  std::locale previous_locale_;
  std::ios::fmtflags fmt_flags_;
  std::streamsize previous_precision_;
};
| }; |
| |
| /// \brief Reader for the Arrow integration testing JSON format |
| class TestingJSONReader { |
| using json = nlohmann::json; |
| |
| public: |
| TestingJSONReader(ArrowBufferAllocator allocator) : allocator_(allocator) {} |
| TestingJSONReader() : TestingJSONReader(ArrowBufferAllocatorDefault()) {} |
| |
| static const int kNumBatchOnlySchema = -2; |
| static const int kNumBatchReadAll = -1; |
| |
| /// \brief Read JSON representing a data file object |
| /// |
| /// Read a JSON object in the form `{"schema": {...}, "batches": [...], ...}`, |
| /// propagating `out` on success. |
| ArrowErrorCode ReadDataFile(const std::string& data_file_json, ArrowArrayStream* out, |
| int num_batch = kNumBatchReadAll, |
| ArrowError* error = nullptr) { |
| dictionaries_.clear(); |
| |
| try { |
| auto obj = json::parse(data_file_json); |
| NANOARROW_RETURN_NOT_OK(Check(obj.is_object(), error, "data file must be object")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(obj.contains("schema"), error, "data file missing key 'schema'")); |
| |
| // Read Schema |
| nanoarrow::UniqueSchema schema; |
| NANOARROW_RETURN_NOT_OK(SetSchema(schema.get(), obj["schema"], error)); |
| |
| NANOARROW_RETURN_NOT_OK( |
| Check(obj.contains("batches"), error, "data file missing key 'batches'")); |
| const auto& batches = obj["batches"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(batches.is_array(), error, "data file batches must be array")); |
| |
| // Populate ArrayView |
| nanoarrow::UniqueArrayView array_view; |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), error)); |
| |
| // Record any dictionaries that might be present |
| if (obj.contains("dictionaries")) { |
| NANOARROW_RETURN_NOT_OK(RecordDictionaryBatches(obj["dictionaries"], error)); |
| } |
| |
| // Get a vector of batch ids to parse |
| std::vector<size_t> batch_ids; |
| if (num_batch == kNumBatchOnlySchema) { |
| batch_ids.resize(0); |
| } else if (num_batch == kNumBatchReadAll) { |
| batch_ids.resize(batches.size()); |
| std::iota(batch_ids.begin(), batch_ids.end(), 0); |
| } else if (num_batch >= 0 && static_cast<size_t>(num_batch) < batches.size()) { |
| batch_ids.push_back(num_batch); |
| } else { |
| ArrowErrorSet(error, "Expected num_batch between 0 and %d but got %d", |
| static_cast<int>(batches.size() - 1), num_batch); |
| return EINVAL; |
| } |
| |
| // Initialize ArrayStream with required capacity |
| nanoarrow::UniqueArrayStream stream; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowBasicArrayStreamInit(stream.get(), schema.get(), batch_ids.size()), error); |
| |
| // Populate ArrayStream batches |
| for (size_t i = 0; i < batch_ids.size(); i++) { |
| nanoarrow::UniqueArray array; |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayInitFromArrayView(array.get(), array_view.get(), error)); |
| SetArrayAllocatorRecursive(array.get()); |
| NANOARROW_RETURN_NOT_OK(SetArrayBatch(batches[batch_ids[i]], schema.get(), |
| array_view.get(), array.get(), error)); |
| ArrowBasicArrayStreamSetArray(stream.get(), i, array.get()); |
| } |
| |
| ArrowArrayStreamMove(stream.get(), out); |
| return NANOARROW_OK; |
| } catch (json::exception& e) { |
| ArrowErrorSet(error, "Exception in TestingJSONReader::ReadDataFile(): %s", |
| e.what()); |
| return EINVAL; |
| } |
| } |
| |
| /// \brief Read JSON representing a Schema |
| /// |
| /// Reads a JSON object in the form `{"fields": [...], "metadata": [...]}`, |
| /// propagating `out` on success. |
| ArrowErrorCode ReadSchema(const std::string& schema_json, ArrowSchema* out, |
| ArrowError* error = nullptr) { |
| try { |
| auto obj = json::parse(schema_json); |
| nanoarrow::UniqueSchema schema; |
| |
| NANOARROW_RETURN_NOT_OK(SetSchema(schema.get(), obj, error)); |
| ArrowSchemaMove(schema.get(), out); |
| return NANOARROW_OK; |
| } catch (json::exception& e) { |
| ArrowErrorSet(error, "Exception in TestingJSONReader::ReadSchema(): %s", e.what()); |
| return EINVAL; |
| } |
| } |
| |
| /// \brief Read JSON representing a Field |
| /// |
| /// Read a JSON object in the form `{"name" : "col", "type": {...}, ...}`, |
| /// propagating `out` on success. |
| ArrowErrorCode ReadField(const std::string& field_json, ArrowSchema* out, |
| ArrowError* error = nullptr) { |
| try { |
| auto obj = json::parse(field_json); |
| nanoarrow::UniqueSchema schema; |
| |
| NANOARROW_RETURN_NOT_OK(SetField(schema.get(), obj, error)); |
| ArrowSchemaMove(schema.get(), out); |
| return NANOARROW_OK; |
| } catch (json::exception& e) { |
| ArrowErrorSet(error, "Exception in TestingJSONReader::ReadField(): %s", e.what()); |
| return EINVAL; |
| } |
| } |
| |
| /// \brief Read JSON representing a RecordBatch |
| /// |
| /// Read a JSON object in the form `{"count": 123, "columns": [...]}`, propagating `out` |
| /// on success. |
| ArrowErrorCode ReadBatch(const std::string& batch_json, const ArrowSchema* schema, |
| ArrowArray* out, ArrowError* error = nullptr) { |
| try { |
| auto obj = json::parse(batch_json); |
| |
| // ArrowArrayView to enable validation |
| nanoarrow::UniqueArrayView array_view; |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayViewInitFromSchema(array_view.get(), schema, error)); |
| |
| // ArrowArray to hold memory |
| nanoarrow::UniqueArray array; |
| NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema, error)); |
| SetArrayAllocatorRecursive(array.get()); |
| |
| NANOARROW_RETURN_NOT_OK( |
| SetArrayBatch(obj, schema, array_view.get(), array.get(), error)); |
| ArrowArrayMove(array.get(), out); |
| return NANOARROW_OK; |
| } catch (json::exception& e) { |
| ArrowErrorSet(error, "Exception in TestingJSONReader::ReadBatch(): %s", e.what()); |
| return EINVAL; |
| } |
| } |
| |
| /// \brief Read JSON representing a Column |
| /// |
| /// Read a JSON object in the form |
| /// `{"name": "col", "count": 123, "VALIDITY": [...], ...}`, propagating |
| /// `out` on success. |
| ArrowErrorCode ReadColumn(const std::string& column_json, const ArrowSchema* schema, |
| ArrowArray* out, ArrowError* error = nullptr) { |
| try { |
| auto obj = json::parse(column_json); |
| |
| // ArrowArrayView to enable validation |
| nanoarrow::UniqueArrayView array_view; |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayViewInitFromSchema(array_view.get(), schema, error)); |
| |
| // ArrowArray to hold memory |
| nanoarrow::UniqueArray array; |
| NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema, error)); |
| SetArrayAllocatorRecursive(array.get()); |
| |
| // Parse the JSON into the array |
| NANOARROW_RETURN_NOT_OK( |
| SetArrayColumn(obj, schema, array_view.get(), array.get(), error)); |
| |
| // Return the result |
| ArrowArrayMove(array.get(), out); |
| return NANOARROW_OK; |
| } catch (json::exception& e) { |
| ArrowErrorSet(error, "Exception in TestingJSONReader::ReadColumn(): %s", e.what()); |
| return EINVAL; |
| } |
| } |
| |
| private: |
| ArrowBufferAllocator allocator_; |
| internal::DictionaryContext dictionaries_; |
| |
| ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_object(), error, "Expected Schema to be a JSON object")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("fields"), error, "Schema missing key 'fields'")); |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRUCT), error); |
| |
| // Top-level schema is non-nullable |
| schema->flags = 0; |
| |
| const auto& fields = value["fields"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(fields.is_array(), error, "Schema fields must be array")); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaAllocateChildren(schema, fields.size()), |
| error); |
| for (int64_t i = 0; i < schema->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], fields[i], error)); |
| } |
| |
| if (value.contains("metadata")) { |
| NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error)); |
| } |
| |
| // Validate! |
| ArrowSchemaView schema_view; |
| NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error)); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetField(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_object(), error, "Expected Field to be a JSON object")); |
| ArrowSchemaInit(schema); |
| |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("name"), error, "Field missing key 'name'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("type"), error, "Field missing key 'type'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("nullable"), error, "Field missing key 'nullable'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("children"), error, "Field missing key 'children'")); |
| |
| // Name |
| const auto& name = value["name"]; |
| NANOARROW_RETURN_NOT_OK(Check(name.is_string() || name.is_null(), error, |
| "Field name must be string or null")); |
| if (name.is_string()) { |
| auto name_str = name.get<std::string>(); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetName(schema, name_str.c_str()), |
| error); |
| } |
| |
| // Nullability |
| const auto& nullable = value["nullable"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(nullable.is_boolean(), error, "Field nullable must be boolean")); |
| if (nullable.get<bool>()) { |
| schema->flags |= ARROW_FLAG_NULLABLE; |
| } else { |
| schema->flags &= ~ARROW_FLAG_NULLABLE; |
| } |
| |
| // Metadata |
| if (value.contains("metadata")) { |
| NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error)); |
| } |
| |
| // If we have a dictionary, this value needs to be in schema->dictionary |
| // and value["dictionary"] needs to be in schema |
| if (value.contains("dictionary")) { |
| // Put the index type in this schema |
| int32_t dictionary_id; |
| NANOARROW_RETURN_NOT_OK( |
| SetDictionary(schema, value["dictionary"], &dictionary_id, error)); |
| |
| // Allocate a dictionary and put this value (minus dictionary, metadata, and name) |
| json value_copy = value; |
| value_copy.erase("dictionary"); |
| value_copy.erase("metadata"); |
| value_copy["name"] = nullptr; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaAllocateDictionary(schema), error); |
| NANOARROW_RETURN_NOT_OK(SetField(schema->dictionary, value_copy, error)); |
| |
| // Keep track of this dictionary_id/schema for parsing batches |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| dictionaries_.RecordSchema(dictionary_id, schema->dictionary), error); |
| |
| // Validate! |
| ArrowSchemaView schema_view; |
| NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error)); |
| |
| return NANOARROW_OK; |
| } |
| |
| NANOARROW_RETURN_NOT_OK(SetType(schema, value["type"], error)); |
| |
| const auto& children = value["children"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(children.is_array(), error, "Field children must be array")); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaAllocateChildren(schema, children.size()), error); |
| for (int64_t i = 0; i < schema->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], children[i], error)); |
| } |
| |
| // Validate! |
| ArrowSchemaView schema_view; |
| NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error)); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetDictionary(ArrowSchema* schema, const json& value, |
| int32_t* dictionary_id, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Dictionary must be object")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("id"), error, "Dictionary missing key 'id'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("indexType"), error, "Dictionary missing key 'type'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("isOrdered"), error, "Dictionary missing key 'isOrdered'")); |
| |
| const auto& id = value["id"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(id.is_number_integer(), error, "Dictionary id must be integer")); |
| *dictionary_id = id.get<int32_t>(); |
| |
| // Parse the index type |
| NANOARROW_RETURN_NOT_OK(SetType(schema, value["indexType"], error)); |
| |
| // Set the flag |
| const auto& is_ordered = value["isOrdered"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(is_ordered.is_boolean(), error, "Dictionary isOrdered must be bool")); |
| if (is_ordered.get<bool>()) { |
| schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; |
| } else { |
| schema->flags &= ~ARROW_FLAG_DICTIONARY_ORDERED; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetType(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Type must be object")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("name"), error, "Type missing key 'name'")); |
| |
| const auto& name = value["name"]; |
| NANOARROW_RETURN_NOT_OK(Check(name.is_string(), error, "Type name must be string")); |
| auto name_str = name.get<std::string>(); |
| |
| if (name_str == "null") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, NANOARROW_TYPE_NA), |
| error); |
| } else if (name_str == "bool") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, NANOARROW_TYPE_BOOL), |
| error); |
| } else if (name_str == "int") { |
| NANOARROW_RETURN_NOT_OK(SetTypeInt(schema, value, error)); |
| } else if (name_str == "floatingpoint") { |
| NANOARROW_RETURN_NOT_OK(SetTypeFloatingPoint(schema, value, error)); |
| } else if (name_str == "decimal") { |
| NANOARROW_RETURN_NOT_OK(SetTypeDecimal(schema, value, error)); |
| } else if (name_str == "utf8") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_STRING), error); |
| } else if (name_str == "largeutf8") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_LARGE_STRING), error); |
| } else if (name_str == "binary") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_BINARY), error); |
| } else if (name_str == "largebinary") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_LARGE_BINARY), error); |
| } else if (name_str == "fixedsizebinary") { |
| NANOARROW_RETURN_NOT_OK(SetTypeFixedSizeBinary(schema, value, error)); |
| } else if (name_str == "list") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+l"), error); |
| } else if (name_str == "largelist") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+L"), error); |
| } else if (name_str == "fixedsizelist") { |
| NANOARROW_RETURN_NOT_OK(SetTypeFixedSizeList(schema, value, error)); |
| } else if (name_str == "date") { |
| NANOARROW_RETURN_NOT_OK(SetTypeDate(schema, value, error)); |
| } else if (name_str == "time") { |
| NANOARROW_RETURN_NOT_OK(SetTypeTime(schema, value, error)); |
| } else if (name_str == "timestamp") { |
| NANOARROW_RETURN_NOT_OK(SetTypeTimestamp(schema, value, error)); |
| } else if (name_str == "duration") { |
| NANOARROW_RETURN_NOT_OK(SetTypeDuration(schema, value, error)); |
| } else if (name_str == "interval") { |
| NANOARROW_RETURN_NOT_OK(SetTypeInterval(schema, value, error)); |
| } else if (name_str == "map") { |
| NANOARROW_RETURN_NOT_OK(SetTypeMap(schema, value, error)); |
| } else if (name_str == "union") { |
| NANOARROW_RETURN_NOT_OK(SetTypeUnion(schema, value, error)); |
| } else if (name_str == "struct") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+s"), error); |
| } else { |
| ArrowErrorSet(error, "Unsupported Type name: '%s'", name_str.c_str()); |
| return ENOTSUP; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeInt(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("bitWidth"), error, |
| "Type[name=='int'] missing key 'bitWidth'")); |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("isSigned"), error, |
| "Type[name=='int'] missing key 'isSigned'")); |
| |
| const auto& bitwidth = value["bitWidth"]; |
| NANOARROW_RETURN_NOT_OK(Check(bitwidth.is_number_integer(), error, |
| "Type[name=='int'] bitWidth must be integer")); |
| |
| const auto& issigned = value["isSigned"]; |
| NANOARROW_RETURN_NOT_OK(Check(issigned.is_boolean(), error, |
| "Type[name=='int'] isSigned must be boolean")); |
| |
| ArrowType type = NANOARROW_TYPE_UNINITIALIZED; |
| if (issigned.get<bool>()) { |
| switch (bitwidth.get<int>()) { |
| case 8: |
| type = NANOARROW_TYPE_INT8; |
| break; |
| case 16: |
| type = NANOARROW_TYPE_INT16; |
| break; |
| case 32: |
| type = NANOARROW_TYPE_INT32; |
| break; |
| case 64: |
| type = NANOARROW_TYPE_INT64; |
| break; |
| default: |
| ArrowErrorSet(error, "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); |
| return EINVAL; |
| } |
| } else { |
| switch (bitwidth.get<int>()) { |
| case 8: |
| type = NANOARROW_TYPE_UINT8; |
| break; |
| case 16: |
| type = NANOARROW_TYPE_UINT16; |
| break; |
| case 32: |
| type = NANOARROW_TYPE_UINT32; |
| break; |
| case 64: |
| type = NANOARROW_TYPE_UINT64; |
| break; |
| default: |
| ArrowErrorSet(error, "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); |
| return EINVAL; |
| } |
| } |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, type), error); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeFloatingPoint(ArrowSchema* schema, const json& value, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("precision"), error, |
| "Type[name=='floatingpoint'] missing key 'precision'")); |
| |
| const auto& precision = value["precision"]; |
| NANOARROW_RETURN_NOT_OK(Check(precision.is_string(), error, |
| "Type[name=='floatingpoint'] bitWidth must be string")); |
| |
| ArrowType type = NANOARROW_TYPE_UNINITIALIZED; |
| auto precision_str = precision.get<std::string>(); |
| if (precision_str == "HALF") { |
| type = NANOARROW_TYPE_HALF_FLOAT; |
| } else if (precision_str == "SINGLE") { |
| type = NANOARROW_TYPE_FLOAT; |
| } else if (precision_str == "DOUBLE") { |
| type = NANOARROW_TYPE_DOUBLE; |
| } else { |
| ArrowErrorSet( |
| error, |
| "Type[name=='floatingpoint'] precision must be 'HALF', 'SINGLE', or 'DOUBLE'"); |
| return EINVAL; |
| } |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, type), error); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeFixedSizeBinary(ArrowSchema* schema, const json& value, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("byteWidth"), error, |
| "Type[name=='fixedsizebinary'] missing key 'byteWidth'")); |
| |
| const auto& byteWidth = value["byteWidth"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(byteWidth.is_number_integer(), error, |
| "Type[name=='fixedsizebinary'] byteWidth must be integer")); |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetTypeFixedSize(schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, |
| byteWidth.get<int>()), |
| error); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeDecimal(ArrowSchema* schema, const json& value, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("precision"), error, |
| "Type[name=='decimal'] missing key 'precision'")); |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("scale"), error, |
| "Type[name=='decimal'] missing key 'scale'")); |
| |
| // Some test files omit bitWidth for decimal128 |
| int bit_width_int; |
| if (value.contains("bitWidth")) { |
| const auto& bit_width = value["bitWidth"]; |
| NANOARROW_RETURN_NOT_OK(Check(bit_width.is_number_integer(), error, |
| "Type[name=='decimal'] bitWidth must be integer")); |
| bit_width_int = bit_width.get<int>(); |
| } else { |
| bit_width_int = 128; |
| } |
| |
| ArrowType type; |
| switch (bit_width_int) { |
| case 128: |
| type = NANOARROW_TYPE_DECIMAL128; |
| break; |
| case 256: |
| type = NANOARROW_TYPE_DECIMAL256; |
| break; |
| default: |
| ArrowErrorSet(error, "Type[name=='decimal'] bitWidth must be 128 or 256"); |
| return EINVAL; |
| } |
| |
| const auto& precision = value["precision"]; |
| NANOARROW_RETURN_NOT_OK(Check(precision.is_number_integer(), error, |
| "Type[name=='decimal'] precision must be integer")); |
| |
| const auto& scale = value["scale"]; |
| NANOARROW_RETURN_NOT_OK(Check(scale.is_number_integer(), error, |
| "Type[name=='decimal'] scale must be integer")); |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetTypeDecimal(schema, type, precision.get<int>(), scale.get<int>()), |
| error); |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeDate(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("unit"), error, "Type[name=='date'] missing key 'unit'")); |
| const auto& unit = value["unit"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(unit.is_string(), error, "Type[name=='date'] unit must be string")); |
| std::string unit_str = unit.get<std::string>(); |
| |
| if (unit_str == "DAY") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_DATE32), error); |
| } else if (unit_str == "MILLISECOND") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_DATE64), error); |
| } else { |
| ArrowErrorSet(error, "Type[name=='date'] unit must be 'DAY' or 'MILLISECOND'"); |
| return EINVAL; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeTime(ArrowSchema* schema, const json& value, ArrowError* error) { |
| ArrowTimeUnit time_unit; |
| NANOARROW_RETURN_NOT_OK(SetTimeUnit(value, &time_unit, error)); |
| |
| const auto& bit_width = value["bitWidth"]; |
| NANOARROW_RETURN_NOT_OK(Check(bit_width.is_number_integer(), error, |
| "Type[name=='time'] bitWidth must be integer")); |
| auto bit_width_int = bit_width.get<int>(); |
| |
| if (bit_width_int == 32) { |
| NANOARROW_RETURN_NOT_OK(Check( |
| time_unit == NANOARROW_TIME_UNIT_SECOND || |
| time_unit == NANOARROW_TIME_UNIT_MILLI, |
| error, "Expected time unit of 'SECOND' or 'MILLISECOND' for bitWidth 32")); |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIME32, time_unit, nullptr), |
| error); |
| return NANOARROW_OK; |
| } else if (bit_width_int == 64) { |
| NANOARROW_RETURN_NOT_OK(Check( |
| time_unit == NANOARROW_TIME_UNIT_MICRO || time_unit == NANOARROW_TIME_UNIT_NANO, |
| error, "Expected time unit of 'MICROSECOND' or 'NANOSECOND' for bitWidth 64")); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIME64, time_unit, nullptr), |
| error); |
| return NANOARROW_OK; |
| } else { |
| ArrowErrorSet(error, "Expected Type[name=='time'] bitWidth of 32 or 64"); |
| return EINVAL; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeTimestamp(ArrowSchema* schema, const json& value, |
| ArrowError* error) { |
| ArrowTimeUnit time_unit; |
| NANOARROW_RETURN_NOT_OK(SetTimeUnit(value, &time_unit, error)); |
| |
| std::string timezone_str; |
| if (value.contains("timezone")) { |
| const auto& timezone = value["timezone"]; |
| NANOARROW_RETURN_NOT_OK(Check(timezone.is_string(), error, |
| "Type[name=='timestamp'] timezone must be string")); |
| timezone_str = timezone.get<std::string>(); |
| } |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_TIMESTAMP, time_unit, |
| timezone_str.c_str()), |
| error); |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeDuration(ArrowSchema* schema, const json& value, |
| ArrowError* error) { |
| ArrowTimeUnit time_unit; |
| NANOARROW_RETURN_NOT_OK(SetTimeUnit(value, &time_unit, error)); |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetTypeDateTime(schema, NANOARROW_TYPE_DURATION, time_unit, nullptr), |
| error); |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTimeUnit(const json& value, ArrowTimeUnit* time_unit, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("unit"), error, "Time-like type missing key 'unit'")); |
| const auto& unit = value["unit"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(unit.is_string(), error, "Time-like type unit must be string")); |
| std::string unit_str = unit.get<std::string>(); |
| |
| if (unit_str == "SECOND") { |
| *time_unit = NANOARROW_TIME_UNIT_SECOND; |
| } else if (unit_str == "MILLISECOND") { |
| *time_unit = NANOARROW_TIME_UNIT_MILLI; |
| } else if (unit_str == "MICROSECOND") { |
| *time_unit = NANOARROW_TIME_UNIT_MICRO; |
| } else if (unit_str == "NANOSECOND") { |
| *time_unit = NANOARROW_TIME_UNIT_NANO; |
| } else { |
| ArrowErrorSet( |
| error, |
| "TimeUnit must be 'SECOND' or 'MILLISECOND', 'MICROSECOND', or 'NANOSECOND'"); |
| return EINVAL; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeInterval(ArrowSchema* schema, const json& value, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("unit"), error, |
| "Type[name=='interval'] missing key 'unit'")); |
| const auto& unit = value["unit"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(unit.is_string(), error, "Type[name=='interval'] unit must be string")); |
| std::string unit_str = unit.get<std::string>(); |
| |
| if (unit_str == "YEAR_MONTH") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_INTERVAL_MONTHS), error); |
| } else if (unit_str == "DAY_TIME") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_INTERVAL_DAY_TIME), error); |
| } else if (unit_str == "MONTH_DAY_NANO") { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetType(schema, NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO), error); |
| } else { |
| ArrowErrorSet(error, |
| "Type[name=='interval'] unit must be 'YEAR_MONTH', 'DAY_TIME', or " |
| "'MONTH_DAY_NANO'"); |
| return EINVAL; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeMap(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("keysSorted"), error, |
| "Type[name=='map'] missing key 'keysSorted'")); |
| |
| const auto& keys_sorted = value["keysSorted"]; |
| NANOARROW_RETURN_NOT_OK(Check(keys_sorted.is_boolean(), error, |
| "Type[name=='map'] keysSorted must be boolean")); |
| |
| if (keys_sorted.get<bool>()) { |
| schema->flags |= ARROW_FLAG_MAP_KEYS_SORTED; |
| } else { |
| schema->flags &= ~ARROW_FLAG_MAP_KEYS_SORTED; |
| } |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+m"), error); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeFixedSizeList(ArrowSchema* schema, const json& value, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("listSize"), error, |
| "Type[name=='fixedsizelist'] missing key 'listSize'")); |
| |
| const auto& list_size = value["listSize"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(list_size.is_number_integer(), error, |
| "Type[name=='fixedsizelist'] listSize must be integer")); |
| |
| std::stringstream format_builder; |
| format_builder << "+w:" << list_size; |
| std::string format = format_builder.str(); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, format.c_str()), |
| error); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetTypeUnion(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("mode"), error, "Type[name=='union'] missing key 'mode'")); |
| NANOARROW_RETURN_NOT_OK(Check(value.contains("typeIds"), error, |
| "Type[name=='union'] missing key 'typeIds'")); |
| |
| const auto& mode = value["mode"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(mode.is_string(), error, "Type[name=='union'] mode must be string")); |
| |
| auto mode_str = mode.get<std::string>(); |
| std::stringstream type_ids_format; |
| |
| if (mode_str == "DENSE") { |
| type_ids_format << "+ud:"; |
| } else if (mode_str == "SPARSE") { |
| type_ids_format << "+us:"; |
| } else { |
| ArrowErrorSet(error, "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'"); |
| return EINVAL; |
| } |
| |
| const auto& type_ids = value["typeIds"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(type_ids.is_array(), error, "Type[name=='union'] typeIds must be array")); |
| |
| if (type_ids.size() > 0) { |
| for (size_t i = 0; i < type_ids.size(); i++) { |
| const auto& type_id = type_ids[i]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(type_id.is_number_integer(), error, |
| "Type[name=='union'] typeIds item must be integer")); |
| type_ids_format << type_id; |
| |
| if ((i + 1) < type_ids.size()) { |
| type_ids_format << ","; |
| } |
| } |
| } |
| |
| std::string type_ids_format_str = type_ids_format.str(); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetFormat(schema, type_ids_format_str.c_str()), error); |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetMetadata(ArrowSchema* schema, const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.is_null() || value.is_array(), error, |
| "Field or Schema metadata must be null or array")); |
| if (value.is_null()) { |
| return NANOARROW_OK; |
| } |
| |
| nanoarrow::UniqueBuffer metadata; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataBuilderInit(metadata.get(), nullptr), |
| error); |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_object(), error, "metadata item must be object")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.contains("key"), error, "metadata item missing key 'key'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.contains("value"), error, "metadata item missing key 'value'")); |
| |
| const auto& key = item["key"]; |
| const auto& value = item["value"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(key.is_string(), error, "metadata item key must be string")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_string(), error, "metadata item value must be string")); |
| |
| auto key_str = key.get<std::string>(); |
| auto value_str = value.get<std::string>(); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowMetadataBuilderAppend(metadata.get(), ArrowCharView(key_str.c_str()), |
| ArrowCharView(value_str.c_str())), |
| error); |
| } |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaSetMetadata(schema, reinterpret_cast<char*>(metadata->data)), error); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetArrayBatch(const json& value, const ArrowSchema* schema, |
| ArrowArrayView* array_view, ArrowArray* array, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_object(), error, "Expected RecordBatch to be a JSON object")); |
| |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("count"), error, "RecordBatch missing key 'count'")); |
| |
| const auto& count = value["count"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(count.is_number_integer(), error, "RecordBatch count must be integer")); |
| array_view->length = count.get<int64_t>(); |
| |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("columns"), error, "RecordBatch missing key 'columns'")); |
| |
| const auto& columns = value["columns"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(columns.is_array(), error, "RecordBatch columns must be array")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(columns.size() == static_cast<size_t>(array_view->n_children), error, |
| "RecordBatch children has incorrect size")); |
| |
| for (int64_t i = 0; i < array_view->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK(SetArrayColumn(columns[i], schema->children[i], |
| array_view->children[i], array->children[i], |
| error)); |
| } |
| |
| // Validate the array view |
| NANOARROW_RETURN_NOT_OK(PrefixError( |
| ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, error), error, |
| "RecordBatch failed to validate: ")); |
| |
| // Flush length and buffer pointers to the Array |
| array->length = array_view->length; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, nullptr), error); |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode RecordDictionaryBatches(const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "dictionaries must be array")); |
| |
| for (const auto& batch : value) { |
| NANOARROW_RETURN_NOT_OK(RecordDictionaryBatch(batch, error)); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode RecordDictionaryBatch(const json& value, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_object(), error, "dictionary batch must be object")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("id"), error, "dictionary batch missing key 'id'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("data"), error, "dictionary batch missing key 'data'")); |
| |
| const auto& id = value["id"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(id.is_number_integer(), error, "dictionary batch id must be integer")); |
| int id_int = id.get<int>(); |
| NANOARROW_RETURN_NOT_OK(Check(dictionaries_.HasDictionaryForId(id_int), error, |
| "dictionary batch has unknown id")); |
| |
| const auto& batch = value["data"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(batch.is_object(), error, "dictionary batch data must be object")); |
| NANOARROW_RETURN_NOT_OK(Check(batch.contains("columns"), error, |
| "dictionary batch missing key 'columns'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(batch.contains("count"), error, "dictionary batch missing key 'count'")); |
| |
| const auto& batch_columns = batch["columns"]; |
| NANOARROW_RETURN_NOT_OK(Check(batch_columns.is_array() && batch_columns.size() == 1, |
| error, |
| "dictionary batch columns must be array of size 1")); |
| |
| const auto& batch_count = batch["count"]; |
| NANOARROW_RETURN_NOT_OK(Check(batch_count.is_number_integer(), error, |
| "dictionary batch count must be integer")); |
| |
| dictionaries_.RecordArray(id_int, batch_count.get<int32_t>(), |
| batch_columns[0].dump()); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetArrayColumn(const json& value, const ArrowSchema* schema, |
| ArrowArrayView* array_view, ArrowArray* array, |
| ArrowError* error, |
| const std::string& parent_error_prefix = "") { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_object(), error, "Expected Column to be a JSON object")); |
| |
| // Check + resolve name early to generate better error messages |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("name"), error, "Column missing key 'name'")); |
| |
| const auto& name = value["name"]; |
| NANOARROW_RETURN_NOT_OK(Check(name.is_null() || name.is_string(), error, |
| "Column name must be string or null")); |
| |
| std::string error_prefix; |
| if (name.is_string()) { |
| error_prefix = parent_error_prefix + "-> Column '" + name.get<std::string>() + "' "; |
| } else { |
| error_prefix = parent_error_prefix + "-> Column <name is null> "; |
| } |
| |
| // Check, resolve, and recurse children |
| NANOARROW_RETURN_NOT_OK( |
| Check(array_view->n_children == 0 || value.contains("children"), error, |
| error_prefix + "missing key children")); |
| |
| if (value.contains("children")) { |
| const auto& children = value["children"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(children.is_array(), error, error_prefix + "children must be array")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(children.size() == static_cast<size_t>(array_view->n_children), error, |
| error_prefix + "children has incorrect size")); |
| |
| for (int64_t i = 0; i < array_view->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], schema->children[i], |
| array_view->children[i], |
| array->children[i], error, error_prefix)); |
| } |
| } |
| |
| // Build buffers |
| for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { |
| NANOARROW_RETURN_NOT_OK( |
| PrefixError(SetArrayColumnBuffers(value, array_view, array, i, error), error, |
| error_prefix)); |
| } |
| |
| // Check + resolve count |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("count"), error, error_prefix + "missing key 'count'")); |
| const auto& count = value["count"]; |
| NANOARROW_RETURN_NOT_OK( |
| Check(count.is_number_integer(), error, error_prefix + "count must be integer")); |
| array_view->length = count.get<int64_t>(); |
| |
| // Set ArrayView buffer views. This is because ArrowArrayInitFromSchema() doesn't |
| // support custom type ids for unions but the ArrayView does (otherwise |
| // ArrowArrayFinishBuilding() would work). |
| for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { |
| ArrowBuffer* buffer = ArrowArrayBuffer(array, i); |
| ArrowBufferView* buffer_view = array_view->buffer_views + i; |
| buffer_view->data.as_uint8 = buffer->data; |
| buffer_view->size_bytes = buffer->size_bytes; |
| |
| // If this is a validity buffer, set the null_count |
| if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY && |
| _ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) { |
| array_view->null_count = |
| array_view->length - |
| ArrowBitCountSet(buffer_view->data.as_uint8, 0, array_view->length); |
| } |
| } |
| |
| // The null type doesn't have any buffers but we can set the null_count |
| if (array_view->storage_type == NANOARROW_TYPE_NA) { |
| array_view->null_count = array_view->length; |
| } |
| |
| // If there is a dictionary associated with schema, parse its value into dictionary |
| if (schema->dictionary != nullptr) { |
| NANOARROW_RETURN_NOT_OK(Check( |
| dictionaries_.HasDictionaryForSchema(schema->dictionary), error, |
| error_prefix + |
| "dictionary could not be resolved from dictionary id in SetArrayColumn()")); |
| |
| const internal::Dictionary& dict = dictionaries_.Get(schema->dictionary); |
| NANOARROW_RETURN_NOT_OK(SetArrayColumn( |
| json::parse(dict.column_json), schema->dictionary, array_view->dictionary, |
| array->dictionary, error, error_prefix + "-> <dictionary> ")); |
| } |
| |
| // Validate the array view |
| NANOARROW_RETURN_NOT_OK(PrefixError( |
| ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, error), error, |
| error_prefix + "failed to validate: ")); |
| |
| // Flush length and buffer pointers to the Array. This also ensures that buffers |
| // are not NULL (matters for some versions of some implementations). |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinishBuildingDefault(array, nullptr), |
| error); |
| array->length = array_view->length; |
| array->null_count = array_view->null_count; |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetArrayColumnBuffers(const json& value, ArrowArrayView* array_view, |
| ArrowArray* array, int buffer_i, |
| ArrowError* error) { |
| ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); |
| |
| switch (array_view->layout.buffer_type[buffer_i]) { |
| case NANOARROW_BUFFER_TYPE_VALIDITY: { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("VALIDITY"), error, "missing key 'VALIDITY'")); |
| const auto& validity = value["VALIDITY"]; |
| NANOARROW_RETURN_NOT_OK( |
| SetBufferBitmap(validity, ArrowArrayValidityBitmap(array), error)); |
| break; |
| } |
| case NANOARROW_BUFFER_TYPE_TYPE_ID: { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("TYPE_ID"), error, "missing key 'TYPE_ID'")); |
| const auto& type_id = value["TYPE_ID"]; |
| NANOARROW_RETURN_NOT_OK(SetBufferInt<int8_t>(type_id, buffer, error)); |
| break; |
| } |
| case NANOARROW_BUFFER_TYPE_UNION_OFFSET: { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); |
| const auto& offset = value["OFFSET"]; |
| NANOARROW_RETURN_NOT_OK(SetBufferInt<int32_t>(offset, buffer, error)); |
| break; |
| } |
| case NANOARROW_BUFFER_TYPE_DATA_OFFSET: { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); |
| const auto& offset = value["OFFSET"]; |
| |
| if (array_view->layout.element_size_bits[buffer_i] == 32) { |
| NANOARROW_RETURN_NOT_OK(SetBufferInt<int32_t>(offset, buffer, error)); |
| } else { |
| NANOARROW_RETURN_NOT_OK(SetBufferInt<int64_t>(offset, buffer, error)); |
| } |
| break; |
| } |
| |
| case NANOARROW_BUFFER_TYPE_DATA: { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.contains("DATA"), error, "missing key 'DATA'")); |
| const auto& data = value["DATA"]; |
| |
| switch (array_view->storage_type) { |
| case NANOARROW_TYPE_BOOL: { |
| nanoarrow::UniqueBitmap bitmap; |
| NANOARROW_RETURN_NOT_OK(SetBufferBitmap(data, bitmap.get(), error)); |
| ArrowBufferMove(&bitmap->buffer, buffer); |
| return NANOARROW_OK; |
| } |
| case NANOARROW_TYPE_INT8: |
| return SetBufferInt<int8_t>(data, buffer, error); |
| case NANOARROW_TYPE_UINT8: |
| return SetBufferInt<uint8_t>(data, buffer, error); |
| case NANOARROW_TYPE_INT16: |
| return SetBufferInt<int16_t>(data, buffer, error); |
| case NANOARROW_TYPE_UINT16: |
| return SetBufferInt<uint16_t>(data, buffer, error); |
| case NANOARROW_TYPE_INT32: |
| case NANOARROW_TYPE_INTERVAL_MONTHS: |
| return SetBufferInt<int32_t>(data, buffer, error); |
| case NANOARROW_TYPE_UINT32: |
| return SetBufferInt<uint32_t>(data, buffer, error); |
| case NANOARROW_TYPE_INT64: |
| return SetBufferInt<int64_t>(data, buffer, error); |
| case NANOARROW_TYPE_UINT64: |
| return SetBufferInt<uint64_t, uint64_t>(data, buffer, error); |
| |
| case NANOARROW_TYPE_FLOAT: |
| return SetBufferFloatingPoint<float>(data, buffer, error); |
| case NANOARROW_TYPE_DOUBLE: |
| return SetBufferFloatingPoint<double>(data, buffer, error); |
| |
| case NANOARROW_TYPE_STRING: |
| return SetBufferString<int32_t>(data, ArrowArrayBuffer(array, buffer_i - 1), |
| buffer, error); |
| case NANOARROW_TYPE_LARGE_STRING: |
| return SetBufferString<int64_t>(data, ArrowArrayBuffer(array, buffer_i - 1), |
| buffer, error); |
| case NANOARROW_TYPE_BINARY: |
| return SetBufferBinary<int32_t>(data, ArrowArrayBuffer(array, buffer_i - 1), |
| buffer, error); |
| case NANOARROW_TYPE_LARGE_BINARY: |
| return SetBufferBinary<int64_t>(data, ArrowArrayBuffer(array, buffer_i - 1), |
| buffer, error); |
| case NANOARROW_TYPE_FIXED_SIZE_BINARY: |
| return SetBufferFixedSizeBinary( |
| data, buffer, array_view->layout.element_size_bits[buffer_i] / 8, error); |
| case NANOARROW_TYPE_INTERVAL_DAY_TIME: |
| return SetBufferIntervalDayTime(data, buffer, error); |
| case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: |
| return SetBufferIntervalMonthDayNano(data, buffer, error); |
| case NANOARROW_TYPE_DECIMAL128: |
| return SetBufferDecimal(data, buffer, 128, error); |
| case NANOARROW_TYPE_DECIMAL256: |
| return SetBufferDecimal(data, buffer, 256, error); |
| default: |
| ArrowErrorSet(error, "storage type %s DATA buffer not supported", |
| ArrowTypeString(array_view->storage_type)); |
| return ENOTSUP; |
| } |
| break; |
| } |
| case NANOARROW_BUFFER_TYPE_NONE: |
| break; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetBufferBitmap(const json& value, ArrowBitmap* bitmap, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "bitmap buffer must be array")); |
| |
| // Reserving with the exact length ensures that the last bits are always zeroed. |
| // This was an assumption made by the C# implementation at the time this was |
| // implemented. |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapReserve(bitmap, value.size()), error); |
| |
| for (const auto& item : value) { |
| // Some example files write bitmaps as [true, false, true] but the documentation |
| // says [1, 0, 1]. Accept both for simplicity. |
| NANOARROW_RETURN_NOT_OK(Check(item.is_boolean() || item.is_number_integer(), error, |
| "bitmap item must be bool or integer")); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapAppend(bitmap, item.get<int>(), 1), |
| error); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| template <typename T, typename BiggerT = int64_t> |
| ArrowErrorCode SetBufferInt(const json& value, ArrowBuffer* buffer, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "int buffer must be array")); |
| |
| for (const auto& item : value) { |
| // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args |
| ArrowErrorCode result = SetBufferIntItem<T, BiggerT>(item, buffer, error); |
| NANOARROW_RETURN_NOT_OK(result); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| template <typename T, typename BiggerT = int64_t> |
| ArrowErrorCode SetBufferIntItem(const json& item, ArrowBuffer* buffer, |
| ArrowError* error) { |
| if (item.is_string()) { |
| try { |
| // The JSON parser here can handle up to 2^64 - 1 |
| auto item_int = json::parse(item.get<std::string>()); |
| return SetBufferIntItem<T, BiggerT>(item_int, buffer, error); |
| } catch (json::parse_error&) { |
| ArrowErrorSet(error, |
| "integer buffer item encoded as string must parse as integer: %s", |
| item.dump().c_str()); |
| return EINVAL; |
| } |
| } |
| |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_number_integer(), error, |
| "integer buffer item must be integer number or string")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(std::numeric_limits<T>::is_signed || item.is_number_unsigned(), error, |
| "expected unsigned integer buffer item but found signed integer '" + |
| item.dump() + "'")); |
| |
| auto item_int = item.get<BiggerT>(); |
| |
| NANOARROW_RETURN_NOT_OK( |
| Check(item_int >= std::numeric_limits<T>::lowest() && |
| item_int <= std::numeric_limits<T>::max(), |
| error, "integer buffer item '" + item.dump() + "' outside type limits")); |
| |
| T buffer_value = static_cast<T>(item_int); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); |
| |
| return NANOARROW_OK; |
| } |
| |
| template <typename T> |
| ArrowErrorCode SetBufferFloatingPoint(const json& value, ArrowBuffer* buffer, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "floatingpoint buffer must be array")); |
| |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_number(), error, "floatingpoint buffer item must be number")); |
| double item_dbl = item.get<double>(); |
| |
| NANOARROW_RETURN_NOT_OK(Check( |
| item_dbl >= std::numeric_limits<T>::lowest() && |
| item_dbl <= std::numeric_limits<T>::max(), |
| error, "floatingpoint buffer item '" + item.dump() + "' outside type limits")); |
| |
| T buffer_value = static_cast<T>(item_dbl); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| template <typename T> |
| ArrowErrorCode SetBufferString(const json& value, ArrowBuffer* offsets, |
| ArrowBuffer* data, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "utf8 data buffer must be array")); |
| |
| // Check offsets against values |
| const T* expected_offset = reinterpret_cast<const T*>(offsets->data); |
| NANOARROW_RETURN_NOT_OK(Check( |
| static_cast<size_t>(offsets->size_bytes) == ((value.size() + 1) * sizeof(T)), |
| error, |
| "Expected offset buffer with " + std::to_string(value.size()) + " elements")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(*expected_offset++ == 0, error, "first offset must be zero")); |
| |
| int64_t last_offset = 0; |
| |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_string(), error, "utf8 data buffer item must be string")); |
| auto item_str = item.get<std::string>(); |
| |
| // Append data |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowBufferAppend(data, reinterpret_cast<const uint8_t*>(item_str.data()), |
| item_str.size()), |
| error); |
| |
| // Check offset |
| last_offset += item_str.size(); |
| NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, |
| "Expected offset value " + |
| std::to_string(last_offset) + |
| " at utf8 data buffer item " + item.dump())); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| template <typename T> |
| ArrowErrorCode SetBufferBinary(const json& value, ArrowBuffer* offsets, |
| ArrowBuffer* data, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "binary data buffer must be array")); |
| |
| // Check offsets against values if not fixed size |
| const T* expected_offset = reinterpret_cast<const T*>(offsets->data); |
| NANOARROW_RETURN_NOT_OK(Check( |
| static_cast<size_t>(offsets->size_bytes) == ((value.size() + 1) * sizeof(T)), |
| error, |
| "Expected offset buffer with " + std::to_string(value.size()) + " elements")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(*expected_offset++ == 0, error, "first offset must be zero")); |
| |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); |
| |
| // Check offset |
| NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == data->size_bytes, error, |
| "Expected offset value " + |
| std::to_string(data->size_bytes) + |
| " at binary data buffer item " + item.dump())); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetBufferFixedSizeBinary(const json& value, ArrowBuffer* data, |
| int64_t fixed_size, ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "binary data buffer must be array")); |
| |
| int64_t last_offset = 0; |
| |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); |
| int64_t item_size_bytes = data->size_bytes - last_offset; |
| |
| NANOARROW_RETURN_NOT_OK(Check(item_size_bytes == fixed_size, error, |
| "Expected fixed size binary value of size " + |
| std::to_string(fixed_size) + |
| " at binary data buffer item " + item.dump())); |
| last_offset = data->size_bytes; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode AppendBinaryElement(const json& item, ArrowBuffer* data, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_string(), error, "binary data buffer item must be string")); |
| auto item_str = item.get<std::string>(); |
| |
| size_t item_size_bytes = item_str.size() / 2; |
| NANOARROW_RETURN_NOT_OK(Check((item_size_bytes * 2) == item_str.size(), error, |
| "binary data buffer item must have even size")); |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBufferReserve(data, item_size_bytes), error); |
| for (size_t i = 0; i < item_str.size(); i += 2) { |
| std::string byte_hex = item_str.substr(i, 2); |
| char* end_ptr; |
| uint8_t byte = static_cast<uint8_t>(std::strtoul(byte_hex.data(), &end_ptr, 16)); |
| NANOARROW_RETURN_NOT_OK( |
| Check(end_ptr == (byte_hex.data() + 2), error, |
| "binary data buffer item must contain a valid hex-encoded byte string")); |
| |
| data->data[data->size_bytes] = byte; |
| data->size_bytes++; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetBufferIntervalDayTime(const json& value, ArrowBuffer* buffer, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "interval_day_time buffer must be array")); |
| |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_object(), error, "interval_day_time buffer item must be object")); |
| NANOARROW_RETURN_NOT_OK(Check(item.contains("days"), error, |
| "interval_day_time buffer item missing key 'days'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.contains("milliseconds"), error, |
| "interval_day_time buffer item missing key 'milliseconds'")); |
| |
| NANOARROW_RETURN_NOT_OK(SetBufferIntItem<int32_t>(item["days"], buffer, error)); |
| NANOARROW_RETURN_NOT_OK( |
| SetBufferIntItem<int32_t>(item["milliseconds"], buffer, error)); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetBufferIntervalMonthDayNano(const json& value, ArrowBuffer* buffer, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "interval buffer must be array")); |
| |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_object(), error, "interval buffer item must be object")); |
| NANOARROW_RETURN_NOT_OK(Check(item.contains("months"), error, |
| "interval buffer item missing key 'months'")); |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.contains("days"), error, "interval buffer item missing key 'days'")); |
| NANOARROW_RETURN_NOT_OK(Check(item.contains("nanoseconds"), error, |
| "interval buffer item missing key 'nanoseconds'")); |
| |
| NANOARROW_RETURN_NOT_OK(SetBufferIntItem<int32_t>(item["months"], buffer, error)); |
| NANOARROW_RETURN_NOT_OK(SetBufferIntItem<int32_t>(item["days"], buffer, error)); |
| NANOARROW_RETURN_NOT_OK( |
| SetBufferIntItem<int64_t>(item["nanoseconds"], buffer, error)); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode SetBufferDecimal(const json& value, ArrowBuffer* buffer, int bitwidth, |
| ArrowError* error) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(value.is_array(), error, "decimal buffer must be array")); |
| |
| ArrowDecimal decimal; |
| ArrowDecimalInit(&decimal, bitwidth, 0, 0); |
| |
| ArrowStringView item_view; |
| |
| for (const auto& item : value) { |
| NANOARROW_RETURN_NOT_OK( |
| Check(item.is_string(), error, "decimal buffer item must be string")); |
| auto item_str = item.get<std::string>(); |
| item_view.data = item_str.data(); |
| item_view.size_bytes = item_str.size(); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowDecimalSetDigits(&decimal, item_view), |
| error); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowBufferAppend(buffer, decimal.words, decimal.n_words * sizeof(uint64_t)), |
| error); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| void SetArrayAllocatorRecursive(ArrowArray* array) { |
| for (int i = 0; i < array->n_buffers; i++) { |
| ArrowArrayBuffer(array, i)->allocator = allocator_; |
| } |
| |
| for (int64_t i = 0; i < array->n_children; i++) { |
| SetArrayAllocatorRecursive(array->children[i]); |
| } |
| |
| if (array->dictionary != nullptr) { |
| SetArrayAllocatorRecursive(array->dictionary); |
| } |
| } |
| |
| ArrowErrorCode PrefixError(ArrowErrorCode value, ArrowError* error, |
| const std::string& prefix) { |
| if (value != NANOARROW_OK && error != nullptr) { |
| std::string msg = prefix + error->message; |
| ArrowErrorSet(error, "%s", msg.c_str()); |
| } |
| |
| return value; |
| } |
| |
| ArrowErrorCode Check(bool value, ArrowError* error, const std::string& err) { |
| if (value) { |
| return NANOARROW_OK; |
| } else { |
| ArrowErrorSet(error, "%s", err.c_str()); |
| return EINVAL; |
| } |
| } |
| }; |
| |
| /// \brief Integration testing comparison utility |
| /// |
| /// Utility to compare ArrowSchema, ArrowArray, and ArrowArrayStream instances. |
| /// This should only be used in the context of integration testing as the |
| /// comparison logic is specific to the integration testing JSON files and |
| /// specification. Notably: |
| /// |
| /// - Map types are considered equal regardless of the child names "entries", |
| /// "key", and "value". |
| /// - Float32 and Float64 values are compared according to their JSON serialization. |
| class TestingJSONComparison { |
| private: |
// One human-readable inequality found by a comparison. WriteDifferences() prints
// path on a "Path: " line, actual on a "- " line, and expected on a "+ " line.
struct Difference {
  // Location of the inequality (e.g., a label followed by ".children[0]")
  std::string path;
  // Rendering of the value that was found
  std::string actual;
  // Rendering of the value that was wanted
  std::string expected;
};
| |
| public: |
| TestingJSONComparison() : compare_batch_flags_(true), compare_metadata_order_(true) { |
| // We do our own metadata comparison |
| writer_actual_.set_include_metadata(false); |
| writer_expected_.set_include_metadata(false); |
| } |
| |
| /// \brief Compare top-level RecordBatch flags (e.g., nullability) |
| /// |
| /// Some Arrow implementations export batches as nullable, and some export them as |
| /// non-nullable. Use false to consider these two types of batches as equivalent. |
| void set_compare_batch_flags(bool value) { compare_batch_flags_ = value; } |
| |
| /// \brief Compare metadata order |
| /// |
| /// Some Arrow implementations store metadata using structures (e.g., hash map) that |
| /// reorder metadata items. Use false to consider metadata whose keys/values have |
| /// been reordered as equivalent. |
| void set_compare_metadata_order(bool value) { compare_metadata_order_ = value; } |
| |
| /// \brief Set float precision |
| /// |
| /// The Arrow Integration Testing JSON document states that values should be compared |
| /// to 3 decimal places to avoid floating point serialization issues. Use -1 to specify |
| /// that all decimal places should be used (the default). |
| void set_compare_float_precision(int value) { |
| writer_actual_.set_float_precision(value); |
| writer_expected_.set_float_precision(value); |
| } |
| |
| /// \brief Returns the number of differences found by the previous call |
| int64_t num_differences() const { return differences_.size(); } |
| |
| /// \brief Dump a human-readable summary of differences to out |
| void WriteDifferences(std::ostream& out) { |
| for (const auto& difference : differences_) { |
| out << "Path: " << difference.path << "\n"; |
| out << "- " << difference.actual << "\n"; |
| out << "+ " << difference.expected << "\n"; |
| out << "\n"; |
| } |
| } |
| |
| /// \brief Clear any existing differences |
| void ClearDifferences() { differences_.clear(); } |
| |
| /// \brief Compare a stream of record batches |
| /// |
| /// Compares actual against expected using the following strategy: |
| /// |
| /// - Compares schemas for equality, returning if differences were found |
| /// - Compares pairs of record batches, returning if one stream finished |
| /// before another. |
| /// |
| /// Returns NANOARROW_OK if the comparison ran without error. Callers must |
| /// query num_differences() to obtain the result of the comparison on success. |
| ArrowErrorCode CompareArrayStream(ArrowArrayStream* actual, ArrowArrayStream* expected, |
| ArrowError* error = nullptr) { |
| // Read both schemas |
| nanoarrow::UniqueSchema actual_schema; |
| nanoarrow::UniqueSchema expected_schema; |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayStreamGetSchema(actual, actual_schema.get(), error)); |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayStreamGetSchema(expected, expected_schema.get(), error)); |
| |
| // Compare them and return if they are not equal |
| NANOARROW_RETURN_NOT_OK( |
| CompareSchema(expected_schema.get(), actual_schema.get(), error, "Schema")); |
| if (num_differences() > 0) { |
| return NANOARROW_OK; |
| } |
| |
| // Keep a record of the schema to compare batches |
| NANOARROW_RETURN_NOT_OK(SetSchema(expected_schema.get(), error)); |
| |
| int64_t n_batches = -1; |
| nanoarrow::UniqueArray actual_array; |
| nanoarrow::UniqueArray expected_array; |
| do { |
| n_batches++; |
| std::string batch_label = std::string("Batch ") + std::to_string(n_batches); |
| |
| // Read a batch from each stream |
| actual_array.reset(); |
| expected_array.reset(); |
| NANOARROW_RETURN_NOT_OK(ArrowArrayStreamGetNext(actual, actual_array.get(), error)); |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayStreamGetNext(expected, expected_array.get(), error)); |
| |
| // Check the finished/unfinished status of both streams |
| if (actual_array->release == nullptr && expected_array->release != nullptr) { |
| differences_.push_back({batch_label, "finished stream", "unfinished stream"}); |
| return NANOARROW_OK; |
| } |
| |
| if (actual_array->release != nullptr && expected_array->release == nullptr) { |
| differences_.push_back({batch_label, "unfinished stream", "finished stream"}); |
| return NANOARROW_OK; |
| } |
| |
| // If both streams are done, break |
| if (actual_array->release == nullptr) { |
| break; |
| } |
| |
| // Compare this batch |
| NANOARROW_RETURN_NOT_OK( |
| CompareBatch(actual_array.get(), expected_array.get(), error, batch_label)); |
| } while (true); |
| |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Compare a top-level ArrowSchema struct |
| /// |
| /// Returns NANOARROW_OK if the comparison ran without error. Callers must |
| /// query num_differences() to obtain the result of the comparison on success. |
| ArrowErrorCode CompareSchema(const ArrowSchema* actual, const ArrowSchema* expected, |
| ArrowError* error = nullptr, |
| const std::string& path = "") { |
| writer_actual_.ResetDictionaries(); |
| writer_expected_.ResetDictionaries(); |
| |
| // Compare the top-level schema "manually" because (1) map type needs special-cased |
| // comparison and (2) it's easier to read the output if differences are separated |
| // by field. |
| ArrowSchemaView actual_view; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaViewInit(&actual_view, actual, nullptr), |
| error); |
| |
| ArrowSchemaView expected_view; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR( |
| ArrowSchemaViewInit(&expected_view, expected, nullptr), error); |
| |
| if (actual_view.type != NANOARROW_TYPE_STRUCT || |
| expected_view.type != NANOARROW_TYPE_STRUCT) { |
| ArrowErrorSet(error, "Top-level schema must be struct"); |
| return EINVAL; |
| } |
| |
| // (Purposefully ignore the name field at the top level) |
| |
| // Compare flags |
| if (compare_batch_flags_ && actual->flags != expected->flags) { |
| differences_.push_back({path, |
| std::string(".flags: ") + std::to_string(actual->flags), |
| std::string(".flags: ") + std::to_string(expected->flags)}); |
| } |
| |
| // Compare children |
| if (actual->n_children != expected->n_children) { |
| differences_.push_back( |
| {path, std::string(".n_children: ") + std::to_string(actual->n_children), |
| std::string(".n_children: ") + std::to_string(expected->n_children)}); |
| } else { |
| for (int64_t i = 0; i < expected->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK(CompareField( |
| actual->children[i], expected->children[i], error, |
| path + std::string(".children[") + std::to_string(i) + std::string("]"))); |
| } |
| } |
| |
| // Compare metadata |
| NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, expected->metadata, error, |
| path + std::string(".metadata"))); |
| |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Set the ArrowSchema to be used to for future calls to CompareBatch(). |
| ArrowErrorCode SetSchema(const ArrowSchema* schema, ArrowError* error = nullptr) { |
| schema_.reset(); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(schema, schema_.get()), error); |
| actual_.reset(); |
| expected_.reset(); |
| |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayViewInitFromSchema(actual_.get(), schema_.get(), error)); |
| NANOARROW_RETURN_NOT_OK( |
| ArrowArrayViewInitFromSchema(expected_.get(), schema_.get(), error)); |
| |
| if (actual_->storage_type != NANOARROW_TYPE_STRUCT) { |
| ArrowErrorSet(error, "Can't SetSchema() with non-struct"); |
| return EINVAL; |
| } |
| |
| // "Write" the schema using both writers to ensure dictionary ids can be resolved |
| // using the ArrowSchema* pointers from schema_ |
| std::stringstream ss; |
| writer_actual_.ResetDictionaries(); |
| writer_expected_.ResetDictionaries(); |
| writer_actual_.WriteSchema(ss, schema_.get()); |
| writer_expected_.WriteSchema(ss, schema_.get()); |
| |
| return NANOARROW_OK; |
| } |
| |
| /// \brief Compare a top-level ArrowArray struct |
| /// |
| /// Returns NANOARROW_OK if the comparison ran without error. Callers must |
| /// query num_differences() to obtain the result of the comparison on success. |
| ArrowErrorCode CompareBatch(const ArrowArray* actual, const ArrowArray* expected, |
| ArrowError* error = nullptr, const std::string& path = "") { |
| NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(expected_.get(), expected, error)); |
| NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(actual_.get(), actual, error)); |
| |
| if (actual->offset != expected->offset) { |
| differences_.push_back({path, ".offset: " + std::to_string(actual->offset), |
| ".offset: " + std::to_string(expected->offset)}); |
| } |
| |
| if (actual->length != expected->length) { |
| differences_.push_back({path, ".length: " + std::to_string(actual->length), |
| ".length: " + std::to_string(expected->length)}); |
| } |
| |
| // ArrowArrayViewSetArray() ensured that number of children of both match schema |
| for (int64_t i = 0; i < expected_->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK(CompareColumn( |
| schema_->children[i], actual_->children[i], expected_->children[i], error, |
| path + std::string(".children[") + std::to_string(i) + "]")); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| private: |
// Writers used to serialize the actual and expected sides of a comparison.
// They are kept separate so that each has its own dictionary state (see the
// ResetDictionaries() calls in CompareSchema() and SetSchema()).
TestingJSONWriter writer_actual_;
TestingJSONWriter writer_expected_;
// Differences accumulated by the Compare*() methods until ClearDifferences()
std::vector<Difference> differences_;
// Deep copy of the schema passed to SetSchema()
nanoarrow::UniqueSchema schema_;
// Array views initialized from schema_ by SetSchema() and pointed at the
// arrays passed to CompareBatch()
nanoarrow::UniqueArrayView actual_;
nanoarrow::UniqueArrayView expected_;

// Comparison options
// Whether CompareSchema() compares the flags of the top-level schema
bool compare_batch_flags_;
// Set by set_compare_metadata_order(); presumably consulted when metadata is
// compared -- confirm against CompareMetadata()
bool compare_metadata_order_;
| |
| ArrowErrorCode CompareField(ArrowSchema* actual, ArrowSchema* expected, |
| ArrowError* error, const std::string& path = "") { |
| // Preprocess both fields such that map types have canonical names |
| nanoarrow::UniqueSchema actual_copy; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(actual, actual_copy.get()), |
| error); |
| nanoarrow::UniqueSchema expected_copy; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaDeepCopy(expected, expected_copy.get()), |
| error); |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(actual_copy.get()), error); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ForceMapNamesCanonical(expected_copy.get()), |
| error); |
| return CompareFieldBase(actual_copy.get(), expected_copy.get(), error, path); |
| } |
| |
| ArrowErrorCode CompareFieldBase(ArrowSchema* actual, ArrowSchema* expected, |
| ArrowError* error, const std::string& path = "") { |
| std::stringstream ss; |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(ss, expected), error); |
| std::string expected_json = ss.str(); |
| |
| ss.str(""); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(ss, actual), error); |
| std::string actual_json = ss.str(); |
| |
| if (actual_json != expected_json) { |
| differences_.push_back({path, actual_json, expected_json}); |
| } |
| |
| NANOARROW_RETURN_NOT_OK(CompareMetadata(actual->metadata, expected->metadata, error, |
| path + std::string(".metadata"))); |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode CompareMetadata(const char* actual, const char* expected, |
| ArrowError* error, const std::string& path = "") { |
| std::stringstream ss; |
| |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss, actual), error); |
| std::string actual_json = ss.str(); |
| |
| ss.str(""); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteMetadata(ss, expected), |
| error); |
| std::string expected_json = ss.str(); |
| |
| bool metadata_equal = actual_json == expected_json; |
| |
| // If there is a difference in the rendered JSON but we aren't being strict about |
| // order, check again using the KeyValue comparison. |
| if (!metadata_equal && !compare_metadata_order_) { |
| NANOARROW_RETURN_NOT_OK( |
| MetadataEqualKeyValue(actual, expected, &metadata_equal, error)); |
| } |
| |
| // If we still have an inequality, add a difference. |
| if (!metadata_equal) { |
| differences_.push_back({path, actual_json, expected_json}); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode MetadataEqualKeyValue(const char* actual, const char* expected, |
| bool* out, ArrowError* error) { |
| std::unordered_map<std::string, std::string> actual_map, expected_map; |
| NANOARROW_RETURN_NOT_OK(MetadataToMap(actual, &actual_map, error)); |
| NANOARROW_RETURN_NOT_OK(MetadataToMap(expected, &expected_map, error)); |
| |
| if (actual_map.size() != expected_map.size()) { |
| *out = false; |
| return NANOARROW_OK; |
| } |
| |
| for (const auto& item : expected_map) { |
| const auto& actual_item = actual_map.find(item.first); |
| if (actual_item == actual_map.end()) { |
| *out = false; |
| return NANOARROW_OK; |
| } |
| |
| if (actual_item->second != item.second) { |
| *out = false; |
| return NANOARROW_OK; |
| } |
| } |
| |
| *out = true; |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode MetadataToMap(const char* metadata, |
| std::unordered_map<std::string, std::string>* out, |
| ArrowError* error) { |
| ArrowMetadataReader reader; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderInit(&reader, metadata), error); |
| |
| ArrowStringView key, value; |
| size_t metadata_num_keys = 0; |
| while (reader.remaining_keys > 0) { |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataReaderRead(&reader, &key, &value), |
| error); |
| out->insert({std::string(key.data, key.size_bytes), |
| std::string(value.data, value.size_bytes)}); |
| metadata_num_keys++; |
| } |
| |
| if (metadata_num_keys != out->size()) { |
| ArrowErrorSet(error, |
| "Comparison of metadata containing duplicate keys without " |
| "considering order is not implemented"); |
| return ENOTSUP; |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode CompareColumn(ArrowSchema* schema, ArrowArrayView* actual, |
| ArrowArrayView* expected, ArrowError* error, |
| const std::string& path = "") { |
| // Compare children and dictionaries first, then higher-level structures after. |
| // This is a redundant because the higher-level serialized JSON will also report |
| // a difference if deeply nested children have differences; however, it will not |
| // contain dictionaries and this output is slightly better (more targeted differences |
| // that are slightly easier to read appear first). |
| for (int64_t i = 0; i < schema->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK( |
| CompareColumn(schema->children[i], actual->children[i], expected->children[i], |
| error, path + ".children[" + std::to_string(i) + "]")); |
| } |
| |
| if (schema->dictionary != nullptr) { |
| NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary, actual->dictionary, |
| expected->dictionary, error, |
| path + ".dictionary")); |
| } |
| |
| std::stringstream ss; |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteColumn(ss, schema, expected), |
| error); |
| std::string expected_json = ss.str(); |
| |
| ss.str(""); |
| NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteColumn(ss, schema, actual), |
| error); |
| std::string actual_json = ss.str(); |
| |
| if (actual_json != expected_json) { |
| differences_.push_back({path, actual_json, expected_json}); |
| } |
| |
| return NANOARROW_OK; |
| } |
| |
| ArrowErrorCode ForceMapNamesCanonical(ArrowSchema* schema) { |
| ArrowSchemaView view; |
| NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); |
| |
| if (view.type == NANOARROW_TYPE_MAP) { |
| NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(schema->children[0], "entries")); |
| NANOARROW_RETURN_NOT_OK( |
| ArrowSchemaSetName(schema->children[0]->children[0], "key")); |
| NANOARROW_RETURN_NOT_OK( |
| ArrowSchemaSetName(schema->children[0]->children[1], "value")); |
| } |
| |
| for (int64_t i = 0; i < schema->n_children; i++) { |
| NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->children[i])); |
| } |
| |
| if (schema->dictionary != nullptr) { |
| NANOARROW_RETURN_NOT_OK(ForceMapNamesCanonical(schema->dictionary)); |
| } |
| |
| return NANOARROW_OK; |
| } |
| }; |
| |
| /// @} |
| |
| } // namespace testing |
| } // namespace nanoarrow |
| |
| #endif |