blob: 7451136cb217b0863f9e1e2fc790492e3b678921 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "nanoarrow.h"
// Release callback installed by ArrowSchemaInit().
//
// Frees the format, name and metadata buffers, releases and frees every
// child and the dictionary, frees private_data, and finally marks the
// schema as released by setting schema->release to NULL.
static void ArrowSchemaReleaseInternal(struct ArrowSchema* schema) {
  if (schema->format != NULL) {
    ArrowFree((void*)schema->format);
  }
  if (schema->name != NULL) {
    ArrowFree((void*)schema->name);
  }
  if (schema->metadata != NULL) {
    ArrowFree((void*)schema->metadata);
  }

  // The child structs themselves are owned (and freed) here, but their
  // contents may have been produced elsewhere, so each live child is asked
  // to release itself through its own callback first.
  if (schema->children != NULL) {
    for (int64_t i = 0; i < schema->n_children; i++) {
      struct ArrowSchema* child = schema->children[i];
      if (child == NULL) {
        continue;
      }
      if (child->release != NULL) {
        ArrowSchemaRelease(child);
      }
      ArrowFree(child);
    }
    ArrowFree(schema->children);
  }

  // Same ownership rule for the dictionary: release its contents via its
  // own callback (if still live), then free the struct.
  struct ArrowSchema* dictionary = schema->dictionary;
  if (dictionary != NULL) {
    if (dictionary->release != NULL) {
      ArrowSchemaRelease(dictionary);
    }
    ArrowFree(dictionary);
  }

  // private_data is not currently used, but free it if something set it
  if (schema->private_data != NULL) {
    ArrowFree(schema->private_data);
  }

  schema->release = NULL;
}
// Maps a parameter-free type to its format string.
//
// Returns NULL for NANOARROW_TYPE_UNINITIALIZED and for any type that is not
// listed below (e.g. types whose format string needs extra parameters).
static const char* ArrowSchemaFormatTemplate(enum ArrowType type) {
  const char* format = NULL;

  switch (type) {
    case NANOARROW_TYPE_NA:
      format = "n";
      break;
    case NANOARROW_TYPE_BOOL:
      format = "b";
      break;

    // integers: upper case is unsigned, lower case is signed
    case NANOARROW_TYPE_UINT8:
      format = "C";
      break;
    case NANOARROW_TYPE_INT8:
      format = "c";
      break;
    case NANOARROW_TYPE_UINT16:
      format = "S";
      break;
    case NANOARROW_TYPE_INT16:
      format = "s";
      break;
    case NANOARROW_TYPE_UINT32:
      format = "I";
      break;
    case NANOARROW_TYPE_INT32:
      format = "i";
      break;
    case NANOARROW_TYPE_UINT64:
      format = "L";
      break;
    case NANOARROW_TYPE_INT64:
      format = "l";
      break;

    // floating point
    case NANOARROW_TYPE_HALF_FLOAT:
      format = "e";
      break;
    case NANOARROW_TYPE_FLOAT:
      format = "f";
      break;
    case NANOARROW_TYPE_DOUBLE:
      format = "g";
      break;

    // strings and binary
    case NANOARROW_TYPE_STRING:
      format = "u";
      break;
    case NANOARROW_TYPE_LARGE_STRING:
      format = "U";
      break;
    case NANOARROW_TYPE_BINARY:
      format = "z";
      break;
    case NANOARROW_TYPE_LARGE_BINARY:
      format = "Z";
      break;

    // dates and intervals
    case NANOARROW_TYPE_DATE32:
      format = "tdD";
      break;
    case NANOARROW_TYPE_DATE64:
      format = "tdm";
      break;
    case NANOARROW_TYPE_INTERVAL_MONTHS:
      format = "tiM";
      break;
    case NANOARROW_TYPE_INTERVAL_DAY_TIME:
      format = "tiD";
      break;
    case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
      format = "tin";
      break;

    // nested types
    case NANOARROW_TYPE_LIST:
      format = "+l";
      break;
    case NANOARROW_TYPE_LARGE_LIST:
      format = "+L";
      break;
    case NANOARROW_TYPE_STRUCT:
      format = "+s";
      break;
    case NANOARROW_TYPE_MAP:
      format = "+m";
      break;

    // NANOARROW_TYPE_UNINITIALIZED and everything else: no template
    default:
      break;
  }

  return format;
}
// Allocates and initializes the children for types whose child layout is
// fixed by the type itself:
//   - list / large list / fixed-size list: one child named "item"
//   - map: one non-nullable struct child named "entries" holding a
//     non-nullable "key" child and a "value" child
// Every other type is left untouched. Returns NANOARROW_OK or the first
// error reported by an allocation/naming helper.
static int ArrowSchemaInitChildrenIfNeeded(struct ArrowSchema* schema,
                                           enum ArrowType type) {
  const int is_list_like = type == NANOARROW_TYPE_LIST ||
                           type == NANOARROW_TYPE_LARGE_LIST ||
                           type == NANOARROW_TYPE_FIXED_SIZE_LIST;

  if (is_list_like) {
    NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1));
    struct ArrowSchema* item = schema->children[0];
    ArrowSchemaInit(item);
    return ArrowSchemaSetName(item, "item");
  }

  if (type == NANOARROW_TYPE_MAP) {
    NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, 1));
    struct ArrowSchema* entries = schema->children[0];
    NANOARROW_RETURN_NOT_OK(ArrowSchemaInitFromType(entries, NANOARROW_TYPE_STRUCT));
    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(entries, "entries"));
    entries->flags &= ~ARROW_FLAG_NULLABLE;

    NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(entries, 2));
    struct ArrowSchema* key = entries->children[0];
    struct ArrowSchema* value = entries->children[1];
    ArrowSchemaInit(key);
    ArrowSchemaInit(value);

    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(key, "key"));
    key->flags &= ~ARROW_FLAG_NULLABLE;
    return ArrowSchemaSetName(value, "value");
  }

  return NANOARROW_OK;
}
// Puts `schema` into an empty-but-live state: no format, name, metadata,
// children, dictionary or private data; the nullable flag set; and
// ArrowSchemaReleaseInternal installed as the release callback.
// Existing field values are overwritten without being freed.
void ArrowSchemaInit(struct ArrowSchema* schema) {
  // Install the callback that knows how to free what the setters allocate
  schema->release = &ArrowSchemaReleaseInternal;
  schema->private_data = NULL;

  // No type information yet
  schema->format = NULL;
  schema->name = NULL;
  schema->metadata = NULL;
  schema->flags = ARROW_FLAG_NULLABLE;

  // No nested schemas yet
  schema->n_children = 0;
  schema->children = NULL;
  schema->dictionary = NULL;
}
ArrowErrorCode ArrowSchemaSetType(struct ArrowSchema* schema, enum ArrowType type) {
// We don't allocate the dictionary because it has to be nullptr
// for non-dictionary-encoded arrays.
// Set the format to a valid format string for type
const char* template_format = ArrowSchemaFormatTemplate(type);
// If type isn't recognized and not explicitly unset
if (template_format == NULL && type != NANOARROW_TYPE_UNINITIALIZED) {
return EINVAL;
}
NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, template_format));
// For types with an umabiguous child structure, allocate children
return ArrowSchemaInitChildrenIfNeeded(schema, type);
}
// Makes `schema` a struct type and allocates `n_children` children, each
// initialized with ArrowSchemaInit() (i.e. live but without a format).
// Returns the first error from setting the type or allocating children.
ArrowErrorCode ArrowSchemaSetTypeStruct(struct ArrowSchema* schema, int64_t n_children) {
  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(schema, NANOARROW_TYPE_STRUCT));
  NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children));

  int64_t child_index = 0;
  while (child_index < n_children) {
    ArrowSchemaInit(schema->children[child_index]);
    child_index++;
  }

  return NANOARROW_OK;
}
// Initializes `schema` and sets its type in one step. If setting the type
// fails the schema is released again before the error code is returned, so
// the caller never has to clean up after a failure.
ArrowErrorCode ArrowSchemaInitFromType(struct ArrowSchema* schema, enum ArrowType type) {
  ArrowSchemaInit(schema);

  const int status = ArrowSchemaSetType(schema, type);
  if (status == NANOARROW_OK) {
    return NANOARROW_OK;
  }

  ArrowSchemaRelease(schema);
  return status;
}
// Sets the format for the two fixed-size types:
//   NANOARROW_TYPE_FIXED_SIZE_BINARY -> "w:<fixed_size>"
//   NANOARROW_TYPE_FIXED_SIZE_LIST   -> "+w:<fixed_size>" (plus an "item" child)
//
// Returns EINVAL for a non-positive size or any other type, ERANGE if the
// format string could not be rendered, or an error from the helpers.
ArrowErrorCode ArrowSchemaSetTypeFixedSize(struct ArrowSchema* schema,
                                           enum ArrowType type, int32_t fixed_size) {
  if (fixed_size <= 0) {
    return EINVAL;
  }

  // The two formats differ only by the leading '+' that marks a nested type
  const char* nested_prefix;
  switch (type) {
    case NANOARROW_TYPE_FIXED_SIZE_BINARY:
      nested_prefix = "";
      break;
    case NANOARROW_TYPE_FIXED_SIZE_LIST:
      nested_prefix = "+";
      break;
    default:
      return EINVAL;
  }

  char format[64];
  const int written =
      snprintf(format, sizeof(format), "%sw:%d", nested_prefix, (int)fixed_size);
  if (written < 0 || (size_t)written >= sizeof(format)) {
    return ERANGE;
  }

  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format));

  if (type != NANOARROW_TYPE_FIXED_SIZE_LIST) {
    return NANOARROW_OK;
  }

  return ArrowSchemaInitChildrenIfNeeded(schema, type);
}
// Sets the format for a decimal type:
//   NANOARROW_TYPE_DECIMAL128 -> "d:<precision>,<scale>"
//   NANOARROW_TYPE_DECIMAL256 -> "d:<precision>,<scale>,256"
//
// Returns EINVAL for a non-positive precision or any other type, ERANGE if
// the format string could not be rendered, or an error from
// ArrowSchemaSetFormat().
ArrowErrorCode ArrowSchemaSetTypeDecimal(struct ArrowSchema* schema, enum ArrowType type,
                                         int32_t decimal_precision,
                                         int32_t decimal_scale) {
  if (decimal_precision <= 0) {
    return EINVAL;
  }

  // "%d" requires an int argument; int32_t is not guaranteed to be int, so
  // convert explicitly (as ArrowSchemaSetTypeFixedSize() does).
  const int precision = (int)decimal_precision;
  const int scale = (int)decimal_scale;

  char buffer[64];
  int n_chars;
  switch (type) {
    case NANOARROW_TYPE_DECIMAL128:
      n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d", precision, scale);
      break;
    case NANOARROW_TYPE_DECIMAL256:
      n_chars = snprintf(buffer, sizeof(buffer), "d:%d,%d,256", precision, scale);
      break;
    default:
      return EINVAL;
  }

  // Negative means an encoding error; >= sizeof(buffer) means truncation
  if (n_chars < 0 || ((size_t)n_chars) >= sizeof(buffer)) {
    return ERANGE;
  }

  buffer[n_chars] = '\0';
  return ArrowSchemaSetFormat(schema, buffer);
}
// Returns the one-character format code for a time unit ("s", "m", "u" or
// "n"), or NULL when `time_unit` is not one of the four known units.
static const char* ArrowTimeUnitFormatString(enum ArrowTimeUnit time_unit) {
  const char* unit_code = NULL;

  switch (time_unit) {
    case NANOARROW_TIME_UNIT_SECOND:
      unit_code = "s";
      break;
    case NANOARROW_TIME_UNIT_MILLI:
      unit_code = "m";
      break;
    case NANOARROW_TIME_UNIT_MICRO:
      unit_code = "u";
      break;
    case NANOARROW_TIME_UNIT_NANO:
      unit_code = "n";
      break;
    default:
      break;
  }

  return unit_code;
}
// Sets the format for a time-unit-parameterized type:
//   NANOARROW_TYPE_TIME32    -> "tt<unit>"  (second or milli only)
//   NANOARROW_TYPE_TIME64    -> "tt<unit>"  (micro or nano only)
//   NANOARROW_TYPE_TIMESTAMP -> "ts<unit>:<timezone>" (NULL timezone -> "")
//   NANOARROW_TYPE_DURATION  -> "tD<unit>"
//
// Returns EINVAL for an unknown time unit, an unsupported type, a unit that
// does not fit the time type, or a non-NULL timezone on anything other than
// a timestamp. Returns ERANGE if the format does not fit the local buffer.
ArrowErrorCode ArrowSchemaSetTypeDateTime(struct ArrowSchema* schema, enum ArrowType type,
                                          enum ArrowTimeUnit time_unit,
                                          const char* timezone) {
  const char* unit_code = ArrowTimeUnitFormatString(time_unit);
  if (unit_code == NULL) {
    return EINVAL;
  }

  const int has_timezone = (timezone != NULL);
  const int is_second_or_milli = time_unit == NANOARROW_TIME_UNIT_SECOND ||
                                 time_unit == NANOARROW_TIME_UNIT_MILLI;
  const int is_micro_or_nano = time_unit == NANOARROW_TIME_UNIT_MICRO ||
                               time_unit == NANOARROW_TIME_UNIT_NANO;

  char format[128];
  int written;

  switch (type) {
    case NANOARROW_TYPE_TIME32:
      // 32-bit time of day only supports the coarse units
      if (has_timezone || is_micro_or_nano) {
        return EINVAL;
      }
      written = snprintf(format, sizeof(format), "tt%s", unit_code);
      break;

    case NANOARROW_TYPE_TIME64:
      // 64-bit time of day only supports the fine units
      if (has_timezone || is_second_or_milli) {
        return EINVAL;
      }
      written = snprintf(format, sizeof(format), "tt%s", unit_code);
      break;

    case NANOARROW_TYPE_TIMESTAMP:
      written = snprintf(format, sizeof(format), "ts%s:%s", unit_code,
                         has_timezone ? timezone : "");
      break;

    case NANOARROW_TYPE_DURATION:
      if (has_timezone) {
        return EINVAL;
      }
      written = snprintf(format, sizeof(format), "tD%s", unit_code);
      break;

    default:
      return EINVAL;
  }

  // Negative: encoding error. Too large: the timezone did not fit.
  if (written < 0 || (size_t)written >= sizeof(format)) {
    return ERANGE;
  }

  return ArrowSchemaSetFormat(schema, format);
}
// Sets the format for a union type with type ids 0..n_children-1
// ("+us:0,1,..." for sparse, "+ud:0,1,..." for dense) and allocates
// `n_children` children, each initialized with ArrowSchemaInit().
//
// Returns EINVAL if n_children is outside [0, 127] or `type` is not a union
// type, ERANGE if the format string could not be rendered, or an error from
// ArrowSchemaSetFormat() / ArrowSchemaAllocateChildren().
ArrowErrorCode ArrowSchemaSetTypeUnion(struct ArrowSchema* schema, enum ArrowType type,
                                       int64_t n_children) {
  if (n_children < 0 || n_children > 127) {
    return EINVAL;
  }

  const char* prefix;
  switch (type) {
    case NANOARROW_TYPE_SPARSE_UNION:
      prefix = "+us:";
      break;
    case NANOARROW_TYPE_DENSE_UNION:
      prefix = "+ud:";
      break;
    default:
      return EINVAL;
  }

  // Max valid size would be +ud:0,1,...126 = 401 characters + null terminator
  char format_out[512];
  char* format_cursor = format_out;
  size_t format_remaining = sizeof(format_out);

  // Every snprintf() result is validated *before* it is used to advance the
  // cursor, so neither an encoding error (negative return) nor truncation
  // can move the cursor outside of format_out.
  int n_chars = snprintf(format_cursor, format_remaining, "%s", prefix);
  if (n_chars < 0 || (size_t)n_chars >= format_remaining) {
    return ERANGE;
  }
  format_cursor += n_chars;
  format_remaining -= (size_t)n_chars;

  for (int64_t i = 0; i < n_children; i++) {
    // Type ids are comma-separated; no separator before the first one
    n_chars = snprintf(format_cursor, format_remaining, "%s%d", i == 0 ? "" : ",",
                       (int)i);
    if (n_chars < 0 || (size_t)n_chars >= format_remaining) {
      return ERANGE;
    }
    format_cursor += n_chars;
    format_remaining -= (size_t)n_chars;
  }

  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetFormat(schema, format_out));

  NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateChildren(schema, n_children));
  for (int64_t i = 0; i < n_children; i++) {
    ArrowSchemaInit(schema->children[i]);
  }

  return NANOARROW_OK;
}
// Replaces schema->format with a private copy of `format` (or NULL when
// `format` is NULL).
//
// The copy is made before the previous value is freed so that (a) passing
// schema->format itself, or a pointer into it, does not read freed memory,
// and (b) the previous value is left intact when allocation fails.
//
// Returns NANOARROW_OK, or ENOMEM if the copy could not be allocated.
ArrowErrorCode ArrowSchemaSetFormat(struct ArrowSchema* schema, const char* format) {
  char* format_copy = NULL;

  if (format != NULL) {
    size_t format_size = strlen(format) + 1;
    format_copy = (char*)ArrowMalloc(format_size);
    if (format_copy == NULL) {
      return ENOMEM;
    }

    memcpy(format_copy, format, format_size);
  }

  if (schema->format != NULL) {
    ArrowFree((void*)schema->format);
  }

  schema->format = format_copy;
  return NANOARROW_OK;
}
// Replaces schema->name with a private copy of `name` (or NULL when `name`
// is NULL).
//
// The copy is made before the previous value is freed so that (a) passing
// schema->name itself, or a pointer into it, does not read freed memory,
// and (b) the previous value is left intact when allocation fails.
//
// Returns NANOARROW_OK, or ENOMEM if the copy could not be allocated.
ArrowErrorCode ArrowSchemaSetName(struct ArrowSchema* schema, const char* name) {
  char* name_copy = NULL;

  if (name != NULL) {
    size_t name_size = strlen(name) + 1;
    name_copy = (char*)ArrowMalloc(name_size);
    if (name_copy == NULL) {
      return ENOMEM;
    }

    memcpy(name_copy, name, name_size);
  }

  if (schema->name != NULL) {
    ArrowFree((void*)schema->name);
  }

  schema->name = name_copy;
  return NANOARROW_OK;
}
// Replaces schema->metadata with a private copy of `metadata` (or NULL when
// `metadata` is NULL). The number of bytes copied is whatever
// ArrowMetadataSizeOf() reports for the input.
//
// The copy is made before the previous value is freed so that (a) passing
// schema->metadata itself does not read freed memory, and (b) the previous
// value is left intact when allocation fails.
//
// Returns NANOARROW_OK, or ENOMEM if the copy could not be allocated.
ArrowErrorCode ArrowSchemaSetMetadata(struct ArrowSchema* schema, const char* metadata) {
  char* metadata_copy = NULL;

  if (metadata != NULL) {
    size_t metadata_size = ArrowMetadataSizeOf(metadata);
    metadata_copy = (char*)ArrowMalloc(metadata_size);
    if (metadata_copy == NULL) {
      return ENOMEM;
    }

    memcpy(metadata_copy, metadata, metadata_size);
  }

  if (schema->metadata != NULL) {
    ArrowFree((void*)schema->metadata);
  }

  schema->metadata = metadata_copy;
  return NANOARROW_OK;
}
// Allocates the children array of `schema` plus one ArrowSchema struct per
// child. Each new child has release == NULL, i.e. it is allocated but not
// yet initialized.
//
// Returns EEXIST if the schema already has a children array and ENOMEM if an
// allocation fails. On a mid-way ENOMEM the array and n_children are already
// set and the not-yet-allocated slots are NULL. A non-positive n_children
// allocates nothing and succeeds.
ArrowErrorCode ArrowSchemaAllocateChildren(struct ArrowSchema* schema,
                                           int64_t n_children) {
  if (schema->children != NULL) {
    return EEXIST;
  }

  if (n_children <= 0) {
    return NANOARROW_OK;
  }

  struct ArrowSchema** children =
      (struct ArrowSchema**)ArrowMalloc(n_children * sizeof(struct ArrowSchema*));
  schema->children = children;
  if (children == NULL) {
    return ENOMEM;
  }

  // Zero the slots first so a partially filled array can be walked safely
  schema->n_children = n_children;
  memset(children, 0, n_children * sizeof(struct ArrowSchema*));

  for (int64_t i = 0; i < n_children; i++) {
    struct ArrowSchema* child =
        (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema));
    children[i] = child;
    if (child == NULL) {
      return ENOMEM;
    }

    child->release = NULL;
  }

  return NANOARROW_OK;
}
// Allocates schema->dictionary. The new struct has release == NULL, i.e. it
// is allocated but not yet initialized.
//
// Returns EEXIST if a dictionary is already present, ENOMEM if the
// allocation fails, and NANOARROW_OK otherwise.
ArrowErrorCode ArrowSchemaAllocateDictionary(struct ArrowSchema* schema) {
  if (schema->dictionary != NULL) {
    return EEXIST;
  }

  struct ArrowSchema* dictionary =
      (struct ArrowSchema*)ArrowMalloc(sizeof(struct ArrowSchema));
  schema->dictionary = dictionary;
  if (dictionary == NULL) {
    return ENOMEM;
  }

  dictionary->release = NULL;
  return NANOARROW_OK;
}
// Recursively copies `schema` into `schema_out`: format, flags, name,
// metadata, every child and (if present) the dictionary.
//
// `schema_out` is initialized here, so any previous content is overwritten
// without being released. On failure `schema_out` is released and the error
// code of the failing step is returned.
ArrowErrorCode ArrowSchemaDeepCopy(const struct ArrowSchema* schema,
                                   struct ArrowSchema* schema_out) {
  int status;

  ArrowSchemaInit(schema_out);

  status = ArrowSchemaSetFormat(schema_out, schema->format);
  if (status != NANOARROW_OK) {
    goto fail;
  }

  schema_out->flags = schema->flags;

  status = ArrowSchemaSetName(schema_out, schema->name);
  if (status != NANOARROW_OK) {
    goto fail;
  }

  status = ArrowSchemaSetMetadata(schema_out, schema->metadata);
  if (status != NANOARROW_OK) {
    goto fail;
  }

  status = ArrowSchemaAllocateChildren(schema_out, schema->n_children);
  if (status != NANOARROW_OK) {
    goto fail;
  }

  for (int64_t i = 0; i < schema->n_children; i++) {
    status = ArrowSchemaDeepCopy(schema->children[i], schema_out->children[i]);
    if (status != NANOARROW_OK) {
      goto fail;
    }
  }

  if (schema->dictionary != NULL) {
    status = ArrowSchemaAllocateDictionary(schema_out);
    if (status != NANOARROW_OK) {
      goto fail;
    }

    status = ArrowSchemaDeepCopy(schema->dictionary, schema_out->dictionary);
    if (status != NANOARROW_OK) {
      goto fail;
    }
  }

  return NANOARROW_OK;

fail:
  // Single cleanup path: drop everything copied so far
  ArrowSchemaRelease(schema_out);
  return status;
}
// Records `type` as both the storage type and the logical type of the view.
// Callers that need a different logical type overwrite schema_view->type
// afterwards.
static void ArrowSchemaViewSetPrimitive(struct ArrowSchemaView* schema_view,
                                        enum ArrowType type) {
  schema_view->storage_type = type;
  schema_view->type = type;
}
// Parses an Arrow format string into `schema_view`.
//
// On success, sets schema_view->type and schema_view->storage_type plus any
// type-specific fields (decimal_precision/decimal_scale/decimal_bitwidth,
// fixed_size, union_type_ids, time_unit, timezone), stores in
// *format_end_out a pointer just past the last character consumed, and
// returns NANOARROW_OK. The caller is expected to compare *format_end_out
// with the end of the string to detect trailing characters.
//
// On a malformed format string, writes a message via ArrowErrorSet() and
// returns EINVAL.
//
// The union branch reads schema_view->schema->n_children, so
// schema_view->schema must be set before calling this function.
static ArrowErrorCode ArrowSchemaViewParse(struct ArrowSchemaView* schema_view,
const char* format,
const char** format_end_out,
struct ArrowError* error) {
// Default: nothing consumed yet
*format_end_out = format;
// needed for decimal parsing
const char* parse_start;
char* parse_end;
switch (format[0]) {
// single-character primitive types
case 'n':
schema_view->type = NANOARROW_TYPE_NA;
schema_view->storage_type = NANOARROW_TYPE_NA;
*format_end_out = format + 1;
return NANOARROW_OK;
case 'b':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_BOOL);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'c':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT8);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'C':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT8);
*format_end_out = format + 1;
return NANOARROW_OK;
case 's':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT16);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'S':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT16);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'i':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'I':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT32);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'l':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'L':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_UINT64);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'e':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_HALF_FLOAT);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'f':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_FLOAT);
*format_end_out = format + 1;
return NANOARROW_OK;
case 'g':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DOUBLE);
*format_end_out = format + 1;
return NANOARROW_OK;
// decimal: "d:precision,scale" or "d:precision,scale,bitwidth"
case 'd':
if (format[1] != ':' || format[2] == '\0') {
ArrowErrorSet(error, "Expected ':precision,scale[,bitwidth]' following 'd'");
return EINVAL;
}
// precision: must consume at least one character and be followed by ','
parse_start = format + 2;
schema_view->decimal_precision = (int32_t)strtol(parse_start, &parse_end, 10);
if (parse_end == parse_start || parse_end[0] != ',') {
ArrowErrorSet(error, "Expected 'precision,scale[,bitwidth]' following 'd:'");
return EINVAL;
}
// scale: must consume at least one character; bitwidth defaults to 128
// when no further ',' follows
parse_start = parse_end + 1;
schema_view->decimal_scale = (int32_t)strtol(parse_start, &parse_end, 10);
if (parse_end == parse_start) {
ArrowErrorSet(error, "Expected 'scale[,bitwidth]' following 'd:precision,'");
return EINVAL;
} else if (parse_end[0] != ',') {
schema_view->decimal_bitwidth = 128;
} else {
parse_start = parse_end + 1;
schema_view->decimal_bitwidth = (int32_t)strtol(parse_start, &parse_end, 10);
if (parse_start == parse_end) {
// NOTE(review): the value missing here is the bitwidth, although the
// message says "precision"
ArrowErrorSet(error, "Expected precision following 'd:precision,scale,'");
return EINVAL;
}
}
*format_end_out = parse_end;
// Only 128- and 256-bit decimals are accepted
switch (schema_view->decimal_bitwidth) {
case 128:
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL128);
return NANOARROW_OK;
case 256:
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_DECIMAL256);
return NANOARROW_OK;
default:
ArrowErrorSet(error, "Expected decimal bitwidth of 128 or 256 but found %d",
(int)schema_view->decimal_bitwidth);
return EINVAL;
}
// validity + data
// fixed-size binary: "w:<width>"
case 'w':
schema_view->type = NANOARROW_TYPE_FIXED_SIZE_BINARY;
schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_BINARY;
if (format[1] != ':' || format[2] == '\0') {
ArrowErrorSet(error, "Expected ':<width>' following 'w'");
return EINVAL;
}
// strtol() writes the end-of-number pointer straight into *format_end_out.
// There is no check here that any digits were consumed.
schema_view->fixed_size = (int32_t)strtol(format + 2, (char**)format_end_out, 10);
return NANOARROW_OK;
// validity + offset + data
case 'z':
schema_view->type = NANOARROW_TYPE_BINARY;
schema_view->storage_type = NANOARROW_TYPE_BINARY;
*format_end_out = format + 1;
return NANOARROW_OK;
case 'u':
schema_view->type = NANOARROW_TYPE_STRING;
schema_view->storage_type = NANOARROW_TYPE_STRING;
*format_end_out = format + 1;
return NANOARROW_OK;
// validity + large_offset + data
case 'Z':
schema_view->type = NANOARROW_TYPE_LARGE_BINARY;
schema_view->storage_type = NANOARROW_TYPE_LARGE_BINARY;
*format_end_out = format + 1;
return NANOARROW_OK;
case 'U':
schema_view->type = NANOARROW_TYPE_LARGE_STRING;
schema_view->storage_type = NANOARROW_TYPE_LARGE_STRING;
*format_end_out = format + 1;
return NANOARROW_OK;
// nested types
case '+':
switch (format[1]) {
// list has validity + offset or offset
case 'l':
schema_view->storage_type = NANOARROW_TYPE_LIST;
schema_view->type = NANOARROW_TYPE_LIST;
*format_end_out = format + 2;
return NANOARROW_OK;
// large list has validity + large_offset or large_offset
case 'L':
schema_view->storage_type = NANOARROW_TYPE_LARGE_LIST;
schema_view->type = NANOARROW_TYPE_LARGE_LIST;
*format_end_out = format + 2;
return NANOARROW_OK;
// just validity buffer
// fixed-size list: "+w:<width>"
case 'w':
if (format[2] != ':' || format[3] == '\0') {
ArrowErrorSet(error, "Expected ':<width>' following '+w'");
return EINVAL;
}
schema_view->storage_type = NANOARROW_TYPE_FIXED_SIZE_LIST;
schema_view->type = NANOARROW_TYPE_FIXED_SIZE_LIST;
// As for 'w': strtol() writes the end pointer into *format_end_out and
// there is no check here that any digits were consumed.
schema_view->fixed_size =
(int32_t)strtol(format + 3, (char**)format_end_out, 10);
return NANOARROW_OK;
case 's':
schema_view->storage_type = NANOARROW_TYPE_STRUCT;
schema_view->type = NANOARROW_TYPE_STRUCT;
*format_end_out = format + 2;
return NANOARROW_OK;
case 'm':
schema_view->storage_type = NANOARROW_TYPE_MAP;
schema_view->type = NANOARROW_TYPE_MAP;
*format_end_out = format + 2;
return NANOARROW_OK;
// unions
// "+ud:<type_ids>" (dense) or "+us:<type_ids>" (sparse)
case 'u':
switch (format[2]) {
case 'd':
schema_view->storage_type = NANOARROW_TYPE_DENSE_UNION;
schema_view->type = NANOARROW_TYPE_DENSE_UNION;
break;
case 's':
schema_view->storage_type = NANOARROW_TYPE_SPARSE_UNION;
schema_view->type = NANOARROW_TYPE_SPARSE_UNION;
break;
default:
ArrowErrorSet(error,
"Expected union format string +us:<type_ids> or "
"+ud:<type_ids> but found '%s'",
format);
return EINVAL;
}
if (format[3] == ':') {
// union_type_ids points into the format string (not a copy). The value
// returned by _ArrowParseUnionTypeIds() must equal the schema's
// number of children.
schema_view->union_type_ids = format + 4;
int64_t n_type_ids =
_ArrowParseUnionTypeIds(schema_view->union_type_ids, NULL);
if (n_type_ids != schema_view->schema->n_children) {
ArrowErrorSet(
error,
"Expected union type_ids parameter to be a comma-separated list of %ld "
"values between 0 and 127 but found '%s'",
(long)schema_view->schema->n_children, schema_view->union_type_ids);
return EINVAL;
}
// The type id list runs to the end of the format string
*format_end_out = format + strlen(format);
return NANOARROW_OK;
} else {
ArrowErrorSet(error,
"Expected union format string +us:<type_ids> or +ud:<type_ids> "
"but found '%s'",
format);
return EINVAL;
}
default:
ArrowErrorSet(error, "Expected nested type format string but found '%s'",
format);
return EINVAL;
}
// date/time types
case 't':
switch (format[1]) {
// date
case 'd':
switch (format[2]) {
case 'D':
// Storage type is INT32; the logical type is then overridden
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32);
schema_view->type = NANOARROW_TYPE_DATE32;
*format_end_out = format + 3;
return NANOARROW_OK;
case 'm':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_DATE64;
*format_end_out = format + 3;
return NANOARROW_OK;
default:
ArrowErrorSet(error, "Expected 'D' or 'm' following 'td' but found '%s'",
format + 2);
return EINVAL;
}
// time of day
// seconds/millis are stored as INT32 (TIME32), micros/nanos as INT64 (TIME64)
case 't':
switch (format[2]) {
case 's':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32);
schema_view->type = NANOARROW_TYPE_TIME32;
schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND;
*format_end_out = format + 3;
return NANOARROW_OK;
case 'm':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT32);
schema_view->type = NANOARROW_TYPE_TIME32;
schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI;
*format_end_out = format + 3;
return NANOARROW_OK;
case 'u':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_TIME64;
schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO;
*format_end_out = format + 3;
return NANOARROW_OK;
case 'n':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_TIME64;
schema_view->time_unit = NANOARROW_TIME_UNIT_NANO;
*format_end_out = format + 3;
return NANOARROW_OK;
default:
ArrowErrorSet(
error, "Expected 's', 'm', 'u', or 'n' following 'tt' but found '%s'",
format + 2);
return EINVAL;
}
// timestamp
// "ts<unit>:<timezone>"; always stored as INT64
case 's':
switch (format[2]) {
case 's':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_TIMESTAMP;
schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND;
break;
case 'm':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_TIMESTAMP;
schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI;
break;
case 'u':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_TIMESTAMP;
schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO;
break;
case 'n':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_TIMESTAMP;
schema_view->time_unit = NANOARROW_TIME_UNIT_NANO;
break;
default:
ArrowErrorSet(
error, "Expected 's', 'm', 'u', or 'n' following 'ts' but found '%s'",
format + 2);
return EINVAL;
}
if (format[3] != ':') {
ArrowErrorSet(error, "Expected ':' following '%.3s' but found '%s'", format,
format + 3);
return EINVAL;
}
// timezone points into the format string (not a copy) and is the empty
// string when nothing follows the ':'. It runs to the end of the format.
schema_view->timezone = format + 4;
*format_end_out = format + strlen(format);
return NANOARROW_OK;
// duration
// always stored as INT64
case 'D':
switch (format[2]) {
case 's':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_DURATION;
schema_view->time_unit = NANOARROW_TIME_UNIT_SECOND;
*format_end_out = format + 3;
return NANOARROW_OK;
case 'm':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_DURATION;
schema_view->time_unit = NANOARROW_TIME_UNIT_MILLI;
*format_end_out = format + 3;
return NANOARROW_OK;
case 'u':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_DURATION;
schema_view->time_unit = NANOARROW_TIME_UNIT_MICRO;
*format_end_out = format + 3;
return NANOARROW_OK;
case 'n':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INT64);
schema_view->type = NANOARROW_TYPE_DURATION;
schema_view->time_unit = NANOARROW_TIME_UNIT_NANO;
*format_end_out = format + 3;
return NANOARROW_OK;
default:
// NOTE(review): the message below is missing the opening quote
// before u (compare the 'tt' and 'ts' messages)
ArrowErrorSet(error,
"Expected 's', 'm', u', or 'n' following 'tD' but found '%s'",
format + 2);
return EINVAL;
}
// interval
// the interval type is used as both the logical and the storage type
case 'i':
switch (format[2]) {
case 'M':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_MONTHS);
*format_end_out = format + 3;
return NANOARROW_OK;
case 'D':
ArrowSchemaViewSetPrimitive(schema_view, NANOARROW_TYPE_INTERVAL_DAY_TIME);
*format_end_out = format + 3;
return NANOARROW_OK;
case 'n':
ArrowSchemaViewSetPrimitive(schema_view,
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO);
*format_end_out = format + 3;
return NANOARROW_OK;
default:
ArrowErrorSet(error,
"Expected 'M', 'D', or 'n' following 'ti' but found '%s'",
format + 2);
return EINVAL;
}
default:
ArrowErrorSet(
error, "Expected 'd', 't', 's', 'D', or 'i' following 't' but found '%s'",
format + 1);
return EINVAL;
}
default:
ArrowErrorSet(error, "Unknown format: '%s'", format);
return EINVAL;
}
}
static ArrowErrorCode ArrowSchemaViewValidateNChildren(
struct ArrowSchemaView* schema_view, int64_t n_children, struct ArrowError* error) {
if (n_children != -1 && schema_view->schema->n_children != n_children) {
ArrowErrorSet(error, "Expected schema with %d children but found %d children",
(int)n_children, (int)schema_view->schema->n_children);
return EINVAL;
}
// Don't do a full validation of children but do check that they won't
// segfault if inspected
struct ArrowSchema* child;
for (int64_t i = 0; i < schema_view->schema->n_children; i++) {
child = schema_view->schema->children[i];
if (child == NULL) {
ArrowErrorSet(error,
"Expected valid schema at schema->children[%ld] but found NULL",
(long)i);
return EINVAL;
} else if (child->release == NULL) {
ArrowErrorSet(
error,
"Expected valid schema at schema->children[%ld] but found a released schema",
(long)i);
return EINVAL;
}
}
return NANOARROW_OK;
}
static ArrowErrorCode ArrowSchemaViewValidateUnion(struct ArrowSchemaView* schema_view,
struct ArrowError* error) {
return ArrowSchemaViewValidateNChildren(schema_view, -1, error);
}
static ArrowErrorCode ArrowSchemaViewValidateMap(struct ArrowSchemaView* schema_view,
struct ArrowError* error) {
NANOARROW_RETURN_NOT_OK(ArrowSchemaViewValidateNChildren(schema_view, 1, error));
if (schema_view->schema->children[0]->n_children != 2) {
ArrowErrorSet(error, "Expected child of map type to have 2 children but found %d",
(int)schema_view->schema->children[0]->n_children);
return EINVAL;
}
if (strcmp(schema_view->schema->children[0]->format, "+s") != 0) {
ArrowErrorSet(error, "Expected format of child of map type to be '+s' but found '%s'",
schema_view->schema->children[0]->format);
return EINVAL;
}
if (schema_view->schema->children[0]->flags & ARROW_FLAG_NULLABLE) {
ArrowErrorSet(error,
"Expected child of map type to be non-nullable but was nullable");
return EINVAL;
}
if (schema_view->schema->children[0]->children[0]->flags & ARROW_FLAG_NULLABLE) {
ArrowErrorSet(error, "Expected key of map type to be non-nullable but was nullable");
return EINVAL;
}
return NANOARROW_OK;
}
static ArrowErrorCode ArrowSchemaViewValidateDictionary(
struct ArrowSchemaView* schema_view, struct ArrowError* error) {
// check for valid index type
switch (schema_view->storage_type) {
case NANOARROW_TYPE_UINT8:
case NANOARROW_TYPE_INT8:
case NANOARROW_TYPE_UINT16:
case NANOARROW_TYPE_INT16:
case NANOARROW_TYPE_UINT32:
case NANOARROW_TYPE_INT32:
case NANOARROW_TYPE_UINT64:
case NANOARROW_TYPE_INT64:
break;
default:
ArrowErrorSet(
error,
"Expected dictionary schema index type to be an integral type but found '%s'",
schema_view->schema->format);
return EINVAL;
}
struct ArrowSchemaView dictionary_schema_view;
return ArrowSchemaViewInit(&dictionary_schema_view, schema_view->schema->dictionary,
error);
}
// Validates schema_view->schema against the structural requirements of
// `type`:
//   - primitive, decimal, string/binary and temporal types: no children
//   - fixed-size binary: fixed_size > 0 and no children
//   - list, large list, fixed-size list: exactly one child
//   - struct: any number of children
//   - union, map, dictionary: delegated to the dedicated validators
// Any other value of `type` yields EINVAL.
static ArrowErrorCode ArrowSchemaViewValidate(struct ArrowSchemaView* schema_view,
                                              enum ArrowType type,
                                              struct ArrowError* error) {
  // Child count required by `type`; -1 means "any number of children"
  int64_t expected_n_children;

  switch (type) {
    case NANOARROW_TYPE_NA:
    case NANOARROW_TYPE_BOOL:
    case NANOARROW_TYPE_UINT8:
    case NANOARROW_TYPE_INT8:
    case NANOARROW_TYPE_UINT16:
    case NANOARROW_TYPE_INT16:
    case NANOARROW_TYPE_UINT32:
    case NANOARROW_TYPE_INT32:
    case NANOARROW_TYPE_UINT64:
    case NANOARROW_TYPE_INT64:
    case NANOARROW_TYPE_HALF_FLOAT:
    case NANOARROW_TYPE_FLOAT:
    case NANOARROW_TYPE_DOUBLE:
    case NANOARROW_TYPE_DECIMAL128:
    case NANOARROW_TYPE_DECIMAL256:
    case NANOARROW_TYPE_STRING:
    case NANOARROW_TYPE_LARGE_STRING:
    case NANOARROW_TYPE_BINARY:
    case NANOARROW_TYPE_LARGE_BINARY:
    case NANOARROW_TYPE_DATE32:
    case NANOARROW_TYPE_DATE64:
    case NANOARROW_TYPE_INTERVAL_MONTHS:
    case NANOARROW_TYPE_INTERVAL_DAY_TIME:
    case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
    case NANOARROW_TYPE_TIMESTAMP:
    case NANOARROW_TYPE_TIME32:
    case NANOARROW_TYPE_TIME64:
    case NANOARROW_TYPE_DURATION:
      expected_n_children = 0;
      break;

    case NANOARROW_TYPE_FIXED_SIZE_BINARY:
      if (schema_view->fixed_size <= 0) {
        ArrowErrorSet(error, "Expected size > 0 for fixed size binary but found size %d",
                      schema_view->fixed_size);
        return EINVAL;
      }
      expected_n_children = 0;
      break;

    case NANOARROW_TYPE_LIST:
    case NANOARROW_TYPE_LARGE_LIST:
    case NANOARROW_TYPE_FIXED_SIZE_LIST:
      expected_n_children = 1;
      break;

    case NANOARROW_TYPE_STRUCT:
      expected_n_children = -1;
      break;

    case NANOARROW_TYPE_SPARSE_UNION:
    case NANOARROW_TYPE_DENSE_UNION:
      return ArrowSchemaViewValidateUnion(schema_view, error);

    case NANOARROW_TYPE_MAP:
      return ArrowSchemaViewValidateMap(schema_view, error);

    case NANOARROW_TYPE_DICTIONARY:
      return ArrowSchemaViewValidateDictionary(schema_view, error);

    default:
      ArrowErrorSet(error, "Expected a valid enum ArrowType value but found %d",
                    (int)schema_view->type);
      return EINVAL;
  }

  return ArrowSchemaViewValidateNChildren(schema_view, expected_n_children, error);
}
ArrowErrorCode ArrowSchemaViewInit(struct ArrowSchemaView* schema_view,
const struct ArrowSchema* schema,
struct ArrowError* error) {
if (schema == NULL) {
ArrowErrorSet(error, "Expected non-NULL schema");
return EINVAL;
}
if (schema->release == NULL) {
ArrowErrorSet(error, "Expected non-released schema");
return EINVAL;
}
schema_view->schema = schema;
const char* format = schema->format;
if (format == NULL) {
ArrowErrorSet(
error,
"Error parsing schema->format: Expected a null-terminated string but found NULL");
return EINVAL;
}
size_t format_len = strlen(format);
if (format_len == 0) {
ArrowErrorSet(error, "Error parsing schema->format: Expected a string with size > 0");
return EINVAL;
}
const char* format_end_out;
int result = ArrowSchemaViewParse(schema_view, format, &format_end_out, error);
if (result != NANOARROW_OK) {
if (error != NULL) {
char child_error[1024];
memcpy(child_error, ArrowErrorMessage(error), 1024);
ArrowErrorSet(error, "Error parsing schema->format: %s", child_error);
}
return result;
}
if ((format + format_len) != format_end_out) {
ArrowErrorSet(error, "Error parsing schema->format '%s': parsed %d/%d characters",
format, (int)(format_end_out - format), (int)(format_len));
return EINVAL;
}
if (schema->dictionary != NULL) {
schema_view->type = NANOARROW_TYPE_DICTIONARY;
}
NANOARROW_RETURN_NOT_OK(
ArrowSchemaViewValidate(schema_view, schema_view->storage_type, error));
if (schema_view->storage_type != schema_view->type) {
NANOARROW_RETURN_NOT_OK(
ArrowSchemaViewValidate(schema_view, schema_view->type, error));
}
int64_t unknown_flags = schema->flags & ~NANOARROW_FLAG_ALL_SUPPORTED;
if (unknown_flags != 0) {
ArrowErrorSet(error, "Unknown ArrowSchema flag");
return EINVAL;
}
if (schema->flags & ARROW_FLAG_DICTIONARY_ORDERED &&
schema_view->type != NANOARROW_TYPE_DICTIONARY) {
ArrowErrorSet(error,
"ARROW_FLAG_DICTIONARY_ORDERED is only relevant for dictionaries");
return EINVAL;
}
if (schema->flags & ARROW_FLAG_MAP_KEYS_SORTED &&
schema_view->type != NANOARROW_TYPE_MAP) {
ArrowErrorSet(error, "ARROW_FLAG_MAP_KEYS_SORTED is only relevant for a map type");
return EINVAL;
}
ArrowLayoutInit(&schema_view->layout, schema_view->storage_type);
if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) {
schema_view->layout.element_size_bits[1] = schema_view->fixed_size * 8;
} else if (schema_view->storage_type == NANOARROW_TYPE_FIXED_SIZE_LIST) {
schema_view->layout.child_size_elements = schema_view->fixed_size;
}
schema_view->extension_name = ArrowCharView(NULL);
schema_view->extension_metadata = ArrowCharView(NULL);
NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata,
ArrowCharView("ARROW:extension:name"),
&schema_view->extension_name));
NANOARROW_RETURN_NOT_OK(ArrowMetadataGetValue(schema->metadata,
ArrowCharView("ARROW:extension:metadata"),
&schema_view->extension_metadata));
return NANOARROW_OK;
}
// Writes a description of schema_view->type into out (at most n bytes, via
// snprintf()), appending the type parameters where the type has them:
// precision/scale for decimals, unit (and timezone) for temporal types, the
// fixed size for fixed-size types, and the type id string for unions.
//
// Returns the value returned by snprintf().
static int64_t ArrowSchemaTypeToStringInternal(struct ArrowSchemaView* schema_view,
                                               char* out, int64_t n) {
  const char* type_name = ArrowTypeString(schema_view->type);
  int n_written;

  switch (schema_view->type) {
    case NANOARROW_TYPE_DECIMAL128:
    case NANOARROW_TYPE_DECIMAL256:
      n_written = snprintf(out, n, "%s(%d, %d)", type_name,
                           (int)schema_view->decimal_precision,
                           (int)schema_view->decimal_scale);
      break;

    case NANOARROW_TYPE_TIMESTAMP:
      n_written =
          snprintf(out, n, "%s('%s', '%s')", type_name,
                   ArrowTimeUnitString(schema_view->time_unit), schema_view->timezone);
      break;

    case NANOARROW_TYPE_TIME32:
    case NANOARROW_TYPE_TIME64:
    case NANOARROW_TYPE_DURATION:
      n_written = snprintf(out, n, "%s('%s')", type_name,
                           ArrowTimeUnitString(schema_view->time_unit));
      break;

    case NANOARROW_TYPE_FIXED_SIZE_BINARY:
    case NANOARROW_TYPE_FIXED_SIZE_LIST:
      n_written = snprintf(out, n, "%s(%ld)", type_name, (long)schema_view->fixed_size);
      break;

    case NANOARROW_TYPE_SPARSE_UNION:
    case NANOARROW_TYPE_DENSE_UNION:
      n_written = snprintf(out, n, "%s([%s])", type_name, schema_view->union_type_ids);
      break;

    default:
      // Types without parameters are described by their name alone
      n_written = snprintf(out, n, "%s", type_name);
      break;
  }

  return n_written;
}
// Helper for bookkeeping to emulate snprintf()-like behaviour spread
// among multiple snprintf() calls.
//
// out:          in/out cursor into the destination buffer (may point to NULL)
// n_chars_last: return value of the most recent snprintf()-like call
// n_remaining:  in/out number of bytes still available at *out
// n_chars:      in/out running total of characters that a sufficiently large
//               buffer would have received
static inline void ArrowToStringLogChars(char** out, int64_t n_chars_last,
                                         int64_t* n_remaining, int64_t* n_chars) {
  // In the unlikely event of snprintf() returning a negative value (encoding
  // error), treat it as "nothing written" so that neither the totals nor the
  // cursor move backwards.
  if (n_chars_last < 0) {
    n_chars_last = 0;
  }

  // snprintf() reports the length it *wanted* to write, which may exceed the
  // space that was available. The cursor must only advance over space that
  // actually exists so that it never points beyond one-past-the-end.
  int64_t n_available = *n_remaining > 0 ? *n_remaining : 0;
  int64_t n_advance = n_chars_last < n_available ? n_chars_last : n_available;

  *n_chars += n_chars_last;
  *n_remaining -= n_chars_last;

  // n_remaining is never less than 0
  if (*n_remaining < 0) {
    *n_remaining = 0;
  }

  // Can't do math on a NULL pointer
  if (*out != NULL) {
    *out += n_advance;
  }
}
int64_t ArrowSchemaToString(const struct ArrowSchema* schema, char* out, int64_t n,
char recursive) {
if (schema == NULL) {
return snprintf(out, n, "[invalid: pointer is null]");
}
if (schema->release == NULL) {
return snprintf(out, n, "[invalid: schema is released]");
}
struct ArrowSchemaView schema_view;
struct ArrowError error;
if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) {
return snprintf(out, n, "[invalid: %s]", ArrowErrorMessage(&error));
}
// Extension type and dictionary should include both the top-level type
// and the storage type.
int is_extension = schema_view.extension_name.size_bytes > 0;
int is_dictionary = schema->dictionary != NULL;
int64_t n_chars = 0;
int64_t n_chars_last = 0;
// Uncommon but not technically impossible that both are true
if (is_extension && is_dictionary) {
n_chars_last = snprintf(
out, n, "%.*s{dictionary(%s)<", (int)schema_view.extension_name.size_bytes,
schema_view.extension_name.data, ArrowTypeString(schema_view.storage_type));
} else if (is_extension) {
n_chars_last = snprintf(out, n, "%.*s{", (int)schema_view.extension_name.size_bytes,
schema_view.extension_name.data);
} else if (is_dictionary) {
n_chars_last =
snprintf(out, n, "dictionary(%s)<", ArrowTypeString(schema_view.storage_type));
}
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars);
if (!is_dictionary) {
n_chars_last = ArrowSchemaTypeToStringInternal(&schema_view, out, n);
} else {
n_chars_last = ArrowSchemaToString(schema->dictionary, out, n, recursive);
}
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars);
if (recursive && schema->format[0] == '+') {
n_chars_last = snprintf(out, n, "<");
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars);
for (int64_t i = 0; i < schema->n_children; i++) {
if (i > 0) {
n_chars_last = snprintf(out, n, ", ");
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars);
}
// ArrowSchemaToStringInternal() will validate the child and print the error,
// but we need the name first
if (schema->children[i] != NULL && schema->children[i]->release != NULL &&
schema->children[i]->name != NULL) {
n_chars_last = snprintf(out, n, "%s: ", schema->children[i]->name);
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars);
}
n_chars_last = ArrowSchemaToString(schema->children[i], out, n, recursive);
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars);
}
n_chars_last = snprintf(out, n, ">");
ArrowToStringLogChars(&out, n_chars_last, &n, &n_chars);
}
if (is_extension && is_dictionary) {
n_chars += snprintf(out, n, ">}");
} else if (is_extension) {
n_chars += snprintf(out, n, "}");
} else if (is_dictionary) {
n_chars += snprintf(out, n, ">");
}
// Ensure that we always return a positive result
if (n_chars > 0) {
return n_chars;
} else {
return 0;
}
}
// Points reader at metadata and positions it before the first key/value pair.
//
// For NULL metadata the reader has zero remaining keys and offset 0. Otherwise
// the leading int32 of metadata is copied into remaining_keys and the offset
// is set just past it. Always returns NANOARROW_OK.
ArrowErrorCode ArrowMetadataReaderInit(struct ArrowMetadataReader* reader,
                                       const char* metadata) {
  reader->metadata = metadata;

  if (metadata != NULL) {
    // memcpy() rather than a cast: the buffer is a plain char array
    memcpy(&reader->remaining_keys, metadata, sizeof(int32_t));
    reader->offset = sizeof(int32_t);
    return NANOARROW_OK;
  }

  reader->remaining_keys = 0;
  reader->offset = 0;
  return NANOARROW_OK;
}
// Reads the next key/value pair from reader and advances past it.
//
// Each pair is read as: int32 key size, key bytes, int32 value size, value
// bytes. key_out and value_out are set to point into reader->metadata (no
// copy is made).
//
// Returns EINVAL when no keys remain, NANOARROW_OK otherwise.
ArrowErrorCode ArrowMetadataReaderRead(struct ArrowMetadataReader* reader,
                                       struct ArrowStringView* key_out,
                                       struct ArrowStringView* value_out) {
  if (reader->remaining_keys <= 0) {
    return EINVAL;
  }

  // Start of the current pair; consumed counts bytes read relative to it
  const char* pair_start = reader->metadata + reader->offset;
  int64_t consumed = 0;

  int32_t key_size;
  memcpy(&key_size, pair_start + consumed, sizeof(int32_t));
  consumed += sizeof(int32_t);
  key_out->data = pair_start + consumed;
  key_out->size_bytes = key_size;
  consumed += key_size;

  int32_t value_size;
  memcpy(&value_size, pair_start + consumed, sizeof(int32_t));
  consumed += sizeof(int32_t);
  value_out->data = pair_start + consumed;
  value_out->size_bytes = value_size;
  consumed += value_size;

  reader->offset += consumed;
  reader->remaining_keys--;
  return NANOARROW_OK;
}
int64_t ArrowMetadataSizeOf(const char* metadata) {
if (metadata == NULL) {
return 0;
}
struct ArrowMetadataReader reader;
struct ArrowStringView key;
struct ArrowStringView value;
if (ArrowMetadataReaderInit(&reader, metadata) != NANOARROW_OK) {
return 0;
}
int64_t size = sizeof(int32_t);
while (ArrowMetadataReaderRead(&reader, &key, &value) == NANOARROW_OK) {
size += sizeof(int32_t) + key.size_bytes + sizeof(int32_t) + value.size_bytes;
}
return size;
}
static ArrowErrorCode ArrowMetadataGetValueInternal(const char* metadata,
struct ArrowStringView* key,
struct ArrowStringView* value_out) {
struct ArrowMetadataReader reader;
struct ArrowStringView existing_key;
struct ArrowStringView existing_value;
NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata));
while (ArrowMetadataReaderRead(&reader, &existing_key, &existing_value) ==
NANOARROW_OK) {
int key_equal = key->size_bytes == existing_key.size_bytes &&
strncmp(key->data, existing_key.data, existing_key.size_bytes) == 0;
if (key_equal) {
value_out->data = existing_value.data;
value_out->size_bytes = existing_value.size_bytes;
break;
}
}
return NANOARROW_OK;
}
// Looks up key in metadata and, when present, points value_out at the value.
// value_out is not modified when the key is absent, so callers should
// initialize it beforehand.
//
// Returns EINVAL if value_out is NULL, otherwise the result of the lookup.
ArrowErrorCode ArrowMetadataGetValue(const char* metadata, struct ArrowStringView key,
                                     struct ArrowStringView* value_out) {
  if (value_out != NULL) {
    return ArrowMetadataGetValueInternal(metadata, &key, value_out);
  }

  return EINVAL;
}
char ArrowMetadataHasKey(const char* metadata, struct ArrowStringView key) {
struct ArrowStringView value = ArrowCharView(NULL);
if (ArrowMetadataGetValue(metadata, key, &value) != NANOARROW_OK) {
return 0;
}
return value.data != NULL;
}
// Initializes buffer and seeds it with a copy of the encoded metadata (nothing
// is copied when ArrowMetadataSizeOf(metadata) is 0, e.g. for NULL metadata).
//
// Returns the result of ArrowBufferAppend().
ArrowErrorCode ArrowMetadataBuilderInit(struct ArrowBuffer* buffer,
                                        const char* metadata) {
  ArrowBufferInit(buffer);

  int64_t metadata_size_bytes = ArrowMetadataSizeOf(metadata);
  return ArrowBufferAppend(buffer, metadata, metadata_size_bytes);
}
// Appends one key/value pair to the encoded metadata held in buffer and bumps
// the leading int32 key count. No check is made for an existing pair with the
// same key.
//
// A NULL value is a no-op that returns NANOARROW_OK. A buffer with zero
// capacity is first given a key count of 0. Returns EINVAL if the buffer's
// capacity is too small to hold the key count, or the error from a failed
// buffer allocation.
static ArrowErrorCode ArrowMetadataBuilderAppendInternal(struct ArrowBuffer* buffer,
                                                         struct ArrowStringView* key,
                                                         struct ArrowStringView* value) {
  if (value == NULL) {
    return NANOARROW_OK;
  }

  // Start a brand new metadata buffer with an empty key count
  if (buffer->capacity_bytes == 0) {
    NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt32(buffer, 0));
  }

  if (((size_t)buffer->capacity_bytes) < sizeof(int32_t)) {
    return EINVAL;
  }

  int32_t key_size = (int32_t)key->size_bytes;
  int32_t value_size = (int32_t)value->size_bytes;

  int32_t n_keys;
  memcpy(&n_keys, buffer->data, sizeof(int32_t));

  // Reserve everything up front so that the unchecked appends below are safe
  NANOARROW_RETURN_NOT_OK(ArrowBufferReserve(
      buffer, sizeof(int32_t) + key_size + sizeof(int32_t) + value_size));

  ArrowBufferAppendUnsafe(buffer, &key_size, sizeof(int32_t));
  ArrowBufferAppendUnsafe(buffer, key->data, key_size);
  ArrowBufferAppendUnsafe(buffer, &value_size, sizeof(int32_t));
  ArrowBufferAppendUnsafe(buffer, value->data, value_size);

  // Write the count back only after reserving: buffer->data is re-read here
  // rather than cached from before the reserve
  n_keys += 1;
  memcpy(buffer->data, &n_keys, sizeof(int32_t));

  return NANOARROW_OK;
}
static ArrowErrorCode ArrowMetadataBuilderSetInternal(struct ArrowBuffer* buffer,
struct ArrowStringView* key,
struct ArrowStringView* value) {
// Inspect the current value to see if we can avoid copying the buffer
struct ArrowStringView current_value = ArrowCharView(NULL);
NANOARROW_RETURN_NOT_OK(
ArrowMetadataGetValueInternal((const char*)buffer->data, key, &current_value));
// The key should be removed but no key exists
if (value == NULL && current_value.data == NULL) {
return NANOARROW_OK;
}
// The key/value can be appended because no key exists
if (value != NULL && current_value.data == NULL) {
return ArrowMetadataBuilderAppendInternal(buffer, key, value);
}
struct ArrowMetadataReader reader;
struct ArrowStringView existing_key;
struct ArrowStringView existing_value;
NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, (const char*)buffer->data));
struct ArrowBuffer new_buffer;
NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&new_buffer, NULL));
while (reader.remaining_keys > 0) {
int result = ArrowMetadataReaderRead(&reader, &existing_key, &existing_value);
if (result != NANOARROW_OK) {
ArrowBufferReset(&new_buffer);
return result;
}
if (key->size_bytes == existing_key.size_bytes &&
strncmp((const char*)key->data, (const char*)existing_key.data,
existing_key.size_bytes) == 0) {
result = ArrowMetadataBuilderAppendInternal(&new_buffer, key, value);
value = NULL;
} else {
result =
ArrowMetadataBuilderAppendInternal(&new_buffer, &existing_key, &existing_value);
}
if (result != NANOARROW_OK) {
ArrowBufferReset(&new_buffer);
return result;
}
}
ArrowBufferReset(buffer);
ArrowBufferMove(&new_buffer, buffer);
return NANOARROW_OK;
}
// Appends the key/value pair to the encoded metadata in buffer without
// checking whether key is already present.
ArrowErrorCode ArrowMetadataBuilderAppend(struct ArrowBuffer* buffer,
                                          struct ArrowStringView key,
                                          struct ArrowStringView value) {
  ArrowErrorCode status = ArrowMetadataBuilderAppendInternal(buffer, &key, &value);
  return status;
}
// Sets key to value in the encoded metadata in buffer, replacing an existing
// pair with the same key or appending a new pair if there is none.
ArrowErrorCode ArrowMetadataBuilderSet(struct ArrowBuffer* buffer,
                                       struct ArrowStringView key,
                                       struct ArrowStringView value) {
  ArrowErrorCode status = ArrowMetadataBuilderSetInternal(buffer, &key, &value);
  return status;
}
// Removes key from the encoded metadata in buffer; a no-op when the key is
// not present.
ArrowErrorCode ArrowMetadataBuilderRemove(struct ArrowBuffer* buffer,
                                          struct ArrowStringView key) {
  // A NULL value tells the shared set/remove helper to drop the key
  ArrowErrorCode status = ArrowMetadataBuilderSetInternal(buffer, &key, NULL);
  return status;
}