blob: 4fb7b7b1bde8932e69dd987ed4be7987374b60f4 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include "nanoarrow.h"
// Release callback installed by ArrowArrayInitFromType(). Frees the buffers
// held in the private data, then every child and the dictionary (invoking
// their own release callbacks when they have one), and finally marks the
// array as released by clearing array->release.
static void ArrowArrayReleaseInternal(struct ArrowArray* array) {
  // Buffers owned by this array live in its private data
  struct ArrowArrayPrivateData* owned =
      (struct ArrowArrayPrivateData*)array->private_data;
  if (owned != NULL) {
    ArrowBitmapReset(&owned->bitmap);
    ArrowBufferReset(&owned->buffers[0]);
    ArrowBufferReset(&owned->buffers[1]);
    ArrowFree(owned);
  }

  // The child structs are owned here, but their content may have been
  // produced elsewhere and therefore carry its own release() callback.
  if (array->children != NULL) {
    for (int64_t k = 0; k < array->n_children; k++) {
      struct ArrowArray* child = array->children[k];
      if (child == NULL) {
        continue;
      }

      if (child->release != NULL) {
        ArrowArrayRelease(child);
      }
      ArrowFree(child);
    }

    ArrowFree(array->children);
  }

  // Same ownership rule for the dictionary: the struct is ours, the content
  // may have its own release() callback.
  struct ArrowArray* dictionary = array->dictionary;
  if (dictionary != NULL) {
    if (dictionary->release != NULL) {
      ArrowArrayRelease(dictionary);
    }
    ArrowFree(dictionary);
  }

  // Mark released
  array->release = NULL;
}
// Sets array->n_buffers to the number of buffers used by storage_type and
// records storage_type in the array's private data.
//
// Returns EINVAL, without modifying the array, when storage_type is not one of
// the types handled below; returns NANOARROW_OK otherwise. The private data is
// dereferenced unconditionally, so array->private_data must already be set.
static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
                                               enum ArrowType storage_type) {
  switch (storage_type) {
    case NANOARROW_TYPE_UNINITIALIZED:
    case NANOARROW_TYPE_NA:
      array->n_buffers = 0;
      break;

    case NANOARROW_TYPE_FIXED_SIZE_LIST:
    case NANOARROW_TYPE_STRUCT:
    case NANOARROW_TYPE_SPARSE_UNION:
      array->n_buffers = 1;
      break;

    case NANOARROW_TYPE_LIST:
    case NANOARROW_TYPE_LARGE_LIST:
    case NANOARROW_TYPE_MAP:
    case NANOARROW_TYPE_BOOL:
    case NANOARROW_TYPE_UINT8:
    case NANOARROW_TYPE_INT8:
    case NANOARROW_TYPE_UINT16:
    case NANOARROW_TYPE_INT16:
    case NANOARROW_TYPE_UINT32:
    case NANOARROW_TYPE_INT32:
    case NANOARROW_TYPE_UINT64:
    case NANOARROW_TYPE_INT64:
    case NANOARROW_TYPE_HALF_FLOAT:
    case NANOARROW_TYPE_FLOAT:
    case NANOARROW_TYPE_DOUBLE:
    case NANOARROW_TYPE_DECIMAL128:
    case NANOARROW_TYPE_DECIMAL256:
    case NANOARROW_TYPE_INTERVAL_MONTHS:
    case NANOARROW_TYPE_INTERVAL_DAY_TIME:
    case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO:
    case NANOARROW_TYPE_FIXED_SIZE_BINARY:
    case NANOARROW_TYPE_DENSE_UNION:
      array->n_buffers = 2;
      break;

    case NANOARROW_TYPE_STRING:
    case NANOARROW_TYPE_LARGE_STRING:
    case NANOARROW_TYPE_BINARY:
    case NANOARROW_TYPE_LARGE_BINARY:
      array->n_buffers = 3;
      break;

    default:
      // Unsupported storage type (the unreachable second return that used to
      // follow this statement has been removed)
      return EINVAL;
  }

  struct ArrowArrayPrivateData* private_data =
      (struct ArrowArrayPrivateData*)array->private_data;
  private_data->storage_type = storage_type;
  return NANOARROW_OK;
}
// Initializes `array` as an empty, owning array of the given storage type.
//
// All public fields are zeroed, a private data block holding the validity
// bitmap and two data buffers is allocated, and array->buffers is pointed at
// the private buffer_data slots. Returns ENOMEM if the private data cannot be
// allocated (array->release is left NULL), the error from
// ArrowArraySetStorageType() if the type is rejected (the array is released),
// or NANOARROW_OK.
ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array,
                                      enum ArrowType storage_type) {
  // Start from an empty array that already knows how to release itself
  array->length = 0;
  array->null_count = 0;
  array->offset = 0;
  array->n_buffers = 0;
  array->n_children = 0;
  array->buffers = NULL;
  array->children = NULL;
  array->dictionary = NULL;
  array->private_data = NULL;
  array->release = &ArrowArrayReleaseInternal;

  struct ArrowArrayPrivateData* pd =
      (struct ArrowArrayPrivateData*)ArrowMalloc(sizeof(struct ArrowArrayPrivateData));
  if (pd == NULL) {
    // Nothing was acquired, so the array is simply marked as released
    array->release = NULL;
    return ENOMEM;
  }

  ArrowBitmapInit(&pd->bitmap);
  ArrowBufferInit(&pd->buffers[0]);
  ArrowBufferInit(&pd->buffers[1]);
  for (int slot = 0; slot < 3; slot++) {
    pd->buffer_data[slot] = NULL;
  }

  array->private_data = pd;
  array->buffers = (const void**)(&pd->buffer_data);

  const int status = ArrowArraySetStorageType(array, storage_type);
  if (status != NANOARROW_OK) {
    ArrowArrayRelease(array);
    return status;
  }

  ArrowLayoutInit(&pd->layout, storage_type);

  // Whether union type ids differ from child indices can only be learned from
  // a schema; without one, assume they are equal.
  pd->union_type_id_is_child_index = 1;
  return NANOARROW_OK;
}
ArrowErrorCode ArrowArrayInitFromArrayView(struct ArrowArray* array,
const struct ArrowArrayView* array_view,
struct ArrowError* error) {
NANOARROW_RETURN_NOT_OK_WITH_ERROR(
ArrowArrayInitFromType(array, array_view->storage_type), error);
int result;
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
private_data->layout = array_view->layout;
if (array_view->n_children > 0) {
result = ArrowArrayAllocateChildren(array, array_view->n_children);
if (result != NANOARROW_OK) {
ArrowArrayRelease(array);
return result;
}
for (int64_t i = 0; i < array_view->n_children; i++) {
result =
ArrowArrayInitFromArrayView(array->children[i], array_view->children[i], error);
if (result != NANOARROW_OK) {
ArrowArrayRelease(array);
return result;
}
}
}
if (array_view->dictionary != NULL) {
result = ArrowArrayAllocateDictionary(array);
if (result != NANOARROW_OK) {
ArrowArrayRelease(array);
return result;
}
result =
ArrowArrayInitFromArrayView(array->dictionary, array_view->dictionary, error);
if (result != NANOARROW_OK) {
ArrowArrayRelease(array);
return result;
}
}
return NANOARROW_OK;
}
ArrowErrorCode ArrowArrayInitFromSchema(struct ArrowArray* array,
const struct ArrowSchema* schema,
struct ArrowError* error) {
struct ArrowArrayView array_view;
NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view, schema, error));
NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromArrayView(array, &array_view, error));
if (array_view.storage_type == NANOARROW_TYPE_DENSE_UNION ||
array_view.storage_type == NANOARROW_TYPE_SPARSE_UNION) {
struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;
// We can still build arrays if this isn't true; however, the append
// functions won't work. Instead, we store this value and error only
// when StartAppending is called.
private_data->union_type_id_is_child_index =
_ArrowUnionTypeIdsWillEqualChildIndices(schema->format + 4, schema->n_children);
}
ArrowArrayViewReset(&array_view);
return NANOARROW_OK;
}
// Allocates `n_children` unreleased child structs (release == NULL) for
// `array` and sets array->n_children.
//
// Returns EINVAL if children were already allocated, NANOARROW_OK without
// allocating when n_children is 0, and ENOMEM if any allocation fails. On
// ENOMEM every allocation made by this call is freed and the array is left
// exactly as it was on entry, so nothing leaks and the call may be retried.
ArrowErrorCode ArrowArrayAllocateChildren(struct ArrowArray* array, int64_t n_children) {
  if (array->children != NULL) {
    return EINVAL;
  }

  if (n_children == 0) {
    return NANOARROW_OK;
  }

  struct ArrowArray** children =
      (struct ArrowArray**)ArrowMalloc(n_children * sizeof(struct ArrowArray*));
  if (children == NULL) {
    return ENOMEM;
  }

  memset(children, 0, n_children * sizeof(struct ArrowArray*));

  for (int64_t i = 0; i < n_children; i++) {
    children[i] = (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray));
    if (children[i] == NULL) {
      // Roll back: array->n_children has not been updated yet, so the release
      // callback would never visit these structs and they would leak.
      for (int64_t j = 0; j < i; j++) {
        ArrowFree(children[j]);
      }
      ArrowFree(children);
      return ENOMEM;
    }

    children[i]->release = NULL;
  }

  // Publish only once every child exists
  array->children = children;
  array->n_children = n_children;
  return NANOARROW_OK;
}
// Allocates an unreleased dictionary struct (release == NULL) for `array`.
// Returns EINVAL if a dictionary is already present, ENOMEM if the allocation
// fails, and NANOARROW_OK otherwise.
ArrowErrorCode ArrowArrayAllocateDictionary(struct ArrowArray* array) {
  if (array->dictionary != NULL) {
    return EINVAL;
  }

  struct ArrowArray* dictionary =
      (struct ArrowArray*)ArrowMalloc(sizeof(struct ArrowArray));
  if (dictionary == NULL) {
    return ENOMEM;
  }

  // Mark as not yet initialized so that releasing the parent only frees it
  dictionary->release = NULL;
  array->dictionary = dictionary;
  return NANOARROW_OK;
}
// Moves the content of `bitmap` into the array's validity bitmap. After the
// call `bitmap` reports a size of zero bits, buffer slot 0 points at the moved
// data, and array->null_count is set to -1.
void ArrowArraySetValidityBitmap(struct ArrowArray* array, struct ArrowBitmap* bitmap) {
  struct ArrowArrayPrivateData* pd = (struct ArrowArrayPrivateData*)array->private_data;
  struct ArrowBitmap* validity = &pd->bitmap;

  // Transfer the buffer, then the bit count that goes with it
  ArrowBufferMove(&bitmap->buffer, &validity->buffer);
  validity->size_bits = bitmap->size_bits;
  bitmap->size_bits = 0;

  pd->buffer_data[0] = validity->buffer.data;
  array->null_count = -1;
}
// Moves `buffer` into buffer slot `i` of `array` and refreshes the matching
// entry of the exported buffer pointers. Slot 0 is the validity bitmap's
// buffer; slots 1 and 2 are the two data buffers. Any other index returns
// EINVAL without touching the array.
ArrowErrorCode ArrowArraySetBuffer(struct ArrowArray* array, int64_t i,
                                   struct ArrowBuffer* buffer) {
  struct ArrowArrayPrivateData* pd = (struct ArrowArrayPrivateData*)array->private_data;

  if (i == 0) {
    ArrowBufferMove(buffer, &pd->bitmap.buffer);
    pd->buffer_data[i] = pd->bitmap.buffer.data;
  } else if (i == 1 || i == 2) {
    ArrowBufferMove(buffer, &pd->buffers[i - 1]);
    pd->buffer_data[i] = pd->buffers[i - 1].data;
  } else {
    return EINVAL;
  }

  return NANOARROW_OK;
}
// Builds (recursively) an array view describing an array created by this
// file: storage type, layout, length/offset/null_count and the current
// pointer and size of each of the three private buffers are copied from the
// array's private data. On failure the view is reset and the error returned.
static ArrowErrorCode ArrowArrayViewInitFromArray(struct ArrowArrayView* array_view,
                                                  struct ArrowArray* array) {
  struct ArrowArrayPrivateData* pd = (struct ArrowArrayPrivateData*)array->private_data;

  ArrowArrayViewInitFromType(array_view, pd->storage_type);
  array_view->layout = pd->layout;
  array_view->array = array;
  array_view->length = array->length;
  array_view->offset = array->offset;
  array_view->null_count = array->null_count;

  // Buffer view 0 mirrors the validity bitmap, views 1 and 2 the data buffers
  array_view->buffer_views[0].data.as_uint8 = pd->bitmap.buffer.data;
  array_view->buffer_views[0].size_bytes = pd->bitmap.buffer.size_bytes;
  for (int b = 0; b < 2; b++) {
    array_view->buffer_views[b + 1].data.as_uint8 = pd->buffers[b].data;
    array_view->buffer_views[b + 1].size_bytes = pd->buffers[b].size_bytes;
  }

  int status = ArrowArrayViewAllocateChildren(array_view, array->n_children);
  if (status != NANOARROW_OK) {
    goto fail;
  }

  for (int64_t k = 0; k < array->n_children; k++) {
    status = ArrowArrayViewInitFromArray(array_view->children[k], array->children[k]);
    if (status != NANOARROW_OK) {
      goto fail;
    }
  }

  if (array->dictionary != NULL) {
    status = ArrowArrayViewAllocateDictionary(array_view);
    if (status != NANOARROW_OK) {
      goto fail;
    }

    status = ArrowArrayViewInitFromArray(array_view->dictionary, array->dictionary);
    if (status != NANOARROW_OK) {
      goto fail;
    }
  }

  return NANOARROW_OK;

fail:
  ArrowArrayViewReset(array_view);
  return status;
}
static ArrowErrorCode ArrowArrayReserveInternal(struct ArrowArray* array,
struct ArrowArrayView* array_view) {
// Loop through buffers and reserve the extra space that we know about
for (int64_t i = 0; i < array->n_buffers; i++) {
// Don't reserve on a validity buffer that hasn't been allocated yet
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY &&
ArrowArrayBuffer(array, i)->data == NULL) {
continue;
}
int64_t additional_size_bytes =
array_view->buffer_views[i].size_bytes - ArrowArrayBuffer(array, i)->size_bytes;
if (additional_size_bytes > 0) {
NANOARROW_RETURN_NOT_OK(
ArrowBufferReserve(ArrowArrayBuffer(array, i), additional_size_bytes));
}
}
// Recursively reserve children
for (int64_t i = 0; i < array->n_children; i++) {
NANOARROW_RETURN_NOT_OK(
ArrowArrayReserveInternal(array->children[i], array_view->children[i]));
}
return NANOARROW_OK;
}
ArrowErrorCode ArrowArrayReserve(struct ArrowArray* array,
int64_t additional_size_elements) {
struct ArrowArrayView array_view;
NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromArray(&array_view, array));
// Calculate theoretical buffer sizes (recursively)
ArrowArrayViewSetLength(&array_view, array->length + additional_size_elements);
// Walk the structure (recursively)
int result = ArrowArrayReserveInternal(array, &array_view);
ArrowArrayViewReset(&array_view);
if (result != NANOARROW_OK) {
return result;
}
return NANOARROW_OK;
}
// Ensures (recursively, including the dictionary) that every buffer whose
// layout type is neither "validity" nor "none" has a non-NULL data pointer,
// by reserving one byte in any such buffer that is still unallocated.
static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) {
  struct ArrowArrayPrivateData* pd = (struct ArrowArrayPrivateData*)array->private_data;

  for (int b = 0; b < NANOARROW_MAX_FIXED_BUFFERS; b++) {
    const int is_validity =
        pd->layout.buffer_type[b] == NANOARROW_BUFFER_TYPE_VALIDITY;
    const int is_absent = pd->layout.buffer_type[b] == NANOARROW_BUFFER_TYPE_NONE;

    if (!is_validity && !is_absent) {
      struct ArrowBuffer* target = ArrowArrayBuffer(array, b);
      if (target->data == NULL) {
        NANOARROW_RETURN_NOT_OK((ArrowBufferReserve(target, 1)));
      }
    }
  }

  for (int64_t k = 0; k < array->n_children; k++) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->children[k]));
  }

  if (array->dictionary != NULL) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary));
  }

  return NANOARROW_OK;
}
// Copies the current data pointer of every internal buffer into the
// buffer_data slots of the private data, recursing into children and the
// dictionary.
static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) {
  struct ArrowArrayPrivateData* pd = (struct ArrowArrayPrivateData*)array->private_data;

  for (int64_t slot = 0; slot < NANOARROW_MAX_FIXED_BUFFERS; slot++) {
    pd->buffer_data[slot] = ArrowArrayBuffer(array, slot)->data;
  }

  for (int64_t k = 0; k < array->n_children; k++) {
    ArrowArrayFlushInternalPointers(array->children[k]);
  }

  struct ArrowArray* dictionary = array->dictionary;
  if (dictionary != NULL) {
    ArrowArrayFlushInternalPointers(dictionary);
  }
}
ArrowErrorCode ArrowArrayFinishBuilding(struct ArrowArray* array,
enum ArrowValidationLevel validation_level,
struct ArrowError* error) {
// Even if the data buffer is size zero, the pointer value needed to be non-null
// in some implementations (at least one version of Arrow C++ at the time this
// was added and C# as later discovered). Only do this fix if we can assume
// CPU data access.
if (validation_level >= NANOARROW_VALIDATION_LEVEL_DEFAULT) {
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayFinalizeBuffers(array), error);
}
// Make sure the value we get with array->buffers[i] is set to the actual
// pointer (which may have changed from the original due to reallocation)
ArrowArrayFlushInternalPointers(array);
if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) {
return NANOARROW_OK;
}
// For validation, initialize an ArrowArrayView with our known buffer sizes
struct ArrowArrayView array_view;
NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowArrayViewInitFromArray(&array_view, array),
error);
int result = ArrowArrayViewValidate(&array_view, validation_level, error);
ArrowArrayViewReset(&array_view);
return result;
}
ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array,
struct ArrowError* error) {
return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error);
}
// Zeroes the whole view, then records `storage_type` and initializes the
// layout that corresponds to it.
void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view,
                                enum ArrowType storage_type) {
  memset(array_view, 0, sizeof(*array_view));

  ArrowLayoutInit(&array_view->layout, storage_type);
  array_view->storage_type = storage_type;
}
// Allocates `n_children` child views for `array_view`, each initialized as
// NANOARROW_TYPE_UNINITIALIZED.
//
// Returns EINVAL if children were already allocated and ENOMEM if an
// allocation fails. On ENOMEM, n_children is already set and the remaining
// slots are NULL, so ArrowArrayViewReset() frees whatever was allocated.
// When n_children is 0 nothing is allocated: a zero-size allocation is
// allowed to return NULL, which must not be mistaken for an out-of-memory
// condition on views of childless types.
ArrowErrorCode ArrowArrayViewAllocateChildren(struct ArrowArrayView* array_view,
                                              int64_t n_children) {
  if (array_view->children != NULL) {
    return EINVAL;
  }

  if (n_children == 0) {
    array_view->n_children = 0;
    return NANOARROW_OK;
  }

  array_view->children =
      (struct ArrowArrayView**)ArrowMalloc(n_children * sizeof(struct ArrowArrayView*));
  if (array_view->children == NULL) {
    return ENOMEM;
  }

  for (int64_t i = 0; i < n_children; i++) {
    array_view->children[i] = NULL;
  }

  // Set the count before allocating the children so that a reset after a
  // partial failure walks (and frees) every slot.
  array_view->n_children = n_children;

  for (int64_t i = 0; i < n_children; i++) {
    array_view->children[i] =
        (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
    if (array_view->children[i] == NULL) {
      return ENOMEM;
    }
    ArrowArrayViewInitFromType(array_view->children[i], NANOARROW_TYPE_UNINITIALIZED);
  }

  return NANOARROW_OK;
}
// Allocates the dictionary view of `array_view` and initializes it as
// NANOARROW_TYPE_UNINITIALIZED. Returns EINVAL if a dictionary view already
// exists, ENOMEM if the allocation fails, and NANOARROW_OK otherwise.
ArrowErrorCode ArrowArrayViewAllocateDictionary(struct ArrowArrayView* array_view) {
  if (array_view->dictionary != NULL) {
    return EINVAL;
  }

  struct ArrowArrayView* dictionary_view =
      (struct ArrowArrayView*)ArrowMalloc(sizeof(struct ArrowArrayView));
  array_view->dictionary = dictionary_view;
  if (dictionary_view == NULL) {
    return ENOMEM;
  }

  ArrowArrayViewInitFromType(dictionary_view, NANOARROW_TYPE_UNINITIALIZED);
  return NANOARROW_OK;
}
// Initializes `array_view` (recursively) from `schema`: storage type and
// layout come from an ArrowSchemaView of the schema, one child view is created
// per schema child, a dictionary view is created when the schema has a
// dictionary, and union types additionally get a 256-entry type id map.
//
// If ArrowSchemaViewInit() fails its error is returned and `array_view` is not
// touched. Every later failure resets `array_view` before returning, so the
// caller never has to clean up after an error.
ArrowErrorCode ArrowArrayViewInitFromSchema(struct ArrowArrayView* array_view,
                                            const struct ArrowSchema* schema,
                                            struct ArrowError* error) {
  struct ArrowSchemaView schema_view;
  int result = ArrowSchemaViewInit(&schema_view, schema, error);
  if (result != NANOARROW_OK) {
    return result;
  }

  ArrowArrayViewInitFromType(array_view, schema_view.storage_type);
  array_view->layout = schema_view.layout;

  result = ArrowArrayViewAllocateChildren(array_view, schema->n_children);
  if (result != NANOARROW_OK) {
    ArrowErrorSet(error, "ArrowArrayViewAllocateChildren() failed");
    ArrowArrayViewReset(array_view);
    return result;
  }

  for (int64_t i = 0; i < schema->n_children; i++) {
    result =
        ArrowArrayViewInitFromSchema(array_view->children[i], schema->children[i], error);
    if (result != NANOARROW_OK) {
      ArrowArrayViewReset(array_view);
      return result;
    }
  }

  if (schema->dictionary != NULL) {
    result = ArrowArrayViewAllocateDictionary(array_view);
    if (result != NANOARROW_OK) {
      ArrowArrayViewReset(array_view);
      return result;
    }

    result =
        ArrowArrayViewInitFromSchema(array_view->dictionary, schema->dictionary, error);
    if (result != NANOARROW_OK) {
      ArrowArrayViewReset(array_view);
      return result;
    }
  }

  if (array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION ||
      array_view->storage_type == NANOARROW_TYPE_DENSE_UNION) {
    array_view->union_type_id_map = (int8_t*)ArrowMalloc(256 * sizeof(int8_t));
    if (array_view->union_type_id_map == NULL) {
      // Children (and possibly a dictionary) have been allocated by now;
      // reset the view like every other failure path so they do not leak.
      ArrowArrayViewReset(array_view);
      return ENOMEM;
    }

    // Every entry starts as -1. The parsed type ids are written to the upper
    // half (index 128 + child index); the lower half is then filled with the
    // reverse mapping (index type id -> child index).
    memset(array_view->union_type_id_map, -1, 256);
    int32_t n_type_ids = _ArrowParseUnionTypeIds(schema_view.union_type_ids,
                                                 array_view->union_type_id_map + 128);
    for (int8_t child_index = 0; child_index < n_type_ids; child_index++) {
      int8_t type_id = array_view->union_type_id_map[128 + child_index];
      array_view->union_type_id_map[type_id] = child_index;
    }
  }

  return NANOARROW_OK;
}
// Frees everything owned by `array_view` (child views, dictionary view and
// the union type id map) and leaves it re-initialized as an
// NANOARROW_TYPE_UNINITIALIZED view.
void ArrowArrayViewReset(struct ArrowArrayView* array_view) {
  if (array_view->children != NULL) {
    for (int64_t k = 0; k < array_view->n_children; k++) {
      struct ArrowArrayView* child = array_view->children[k];
      if (child == NULL) {
        // Slot left empty by a partially failed allocation
        continue;
      }

      ArrowArrayViewReset(child);
      ArrowFree(child);
    }

    ArrowFree(array_view->children);
  }

  struct ArrowArrayView* dictionary_view = array_view->dictionary;
  if (dictionary_view != NULL) {
    ArrowArrayViewReset(dictionary_view);
    ArrowFree(dictionary_view);
  }

  if (array_view->union_type_id_map != NULL) {
    ArrowFree(array_view->union_type_id_map);
  }

  ArrowArrayViewInitFromType(array_view, NANOARROW_TYPE_UNINITIALIZED);
}
void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) {
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8;
switch (array_view->layout.buffer_type[i]) {
case NANOARROW_BUFFER_TYPE_VALIDITY:
array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length);
continue;
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
// Probably don't want/need to rely on the producer to have allocated an
// offsets buffer of length 1 for a zero-size array
array_view->buffer_views[i].size_bytes =
(length != 0) * element_size_bytes * (length + 1);
continue;
case NANOARROW_BUFFER_TYPE_DATA:
array_view->buffer_views[i].size_bytes =
_ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] * length) /
8;
continue;
case NANOARROW_BUFFER_TYPE_TYPE_ID:
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
array_view->buffer_views[i].size_bytes = element_size_bytes * length;
continue;
case NANOARROW_BUFFER_TYPE_NONE:
array_view->buffer_views[i].size_bytes = 0;
continue;
}
}
switch (array_view->storage_type) {
case NANOARROW_TYPE_STRUCT:
case NANOARROW_TYPE_SPARSE_UNION:
for (int64_t i = 0; i < array_view->n_children; i++) {
ArrowArrayViewSetLength(array_view->children[i], length);
}
break;
case NANOARROW_TYPE_FIXED_SIZE_LIST:
if (array_view->n_children >= 1) {
ArrowArrayViewSetLength(array_view->children[0],
length * array_view->layout.child_size_elements);
}
default:
break;
}
}
// This version recursively extracts information from the array and stores it
// in the array view, performing any checks that require the original array.
//
// For every buffer expected by the view's layout the buffer pointer is copied
// and its size is set to 0 (NULL pointer) or -1 (non-NULL pointer, size still
// unknown). Returns EINVAL with a message in `error` when the number of
// buffers, the number of children or the presence of a dictionary does not
// match what the view expects; NANOARROW_OK otherwise.
static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
                                          const struct ArrowArray* array,
                                          struct ArrowError* error) {
  array_view->array = array;
  array_view->offset = array->offset;
  array_view->length = array->length;
  array_view->null_count = array->null_count;

  // Count the buffers the layout requires (the first NONE entry ends the list)
  int64_t buffers_required = 0;
  for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
    if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) {
      break;
    }

    buffers_required++;
  }

  // Check the number of buffers before touching array->buffers: an array that
  // provides fewer buffers than the layout expects would otherwise be read
  // past the end of its buffers[] array.
  if (buffers_required != array->n_buffers) {
    ArrowErrorSet(error, "Expected array with %d buffer(s) but found %d buffer(s)",
                  (int)buffers_required, (int)array->n_buffers);
    return EINVAL;
  }

  for (int64_t i = 0; i < buffers_required; i++) {
    // Set buffer pointer
    array_view->buffer_views[i].data.data = array->buffers[i];

    // If non-null, set buffer size to unknown.
    if (array->buffers[i] == NULL) {
      array_view->buffer_views[i].size_bytes = 0;
    } else {
      array_view->buffer_views[i].size_bytes = -1;
    }
  }

  // Check number of children
  if (array_view->n_children != array->n_children) {
    ArrowErrorSet(error, "Expected %ld children but found %ld children",
                  (long)array_view->n_children, (long)array->n_children);
    return EINVAL;
  }

  // Recurse for children
  for (int64_t i = 0; i < array_view->n_children; i++) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view->children[i],
                                                           array->children[i], error));
  }

  // Check dictionary
  if (array->dictionary == NULL && array_view->dictionary != NULL) {
    ArrowErrorSet(error, "Expected dictionary but found NULL");
    return EINVAL;
  }

  if (array->dictionary != NULL && array_view->dictionary == NULL) {
    ArrowErrorSet(error, "Expected NULL dictionary but found dictionary member");
    return EINVAL;
  }

  if (array->dictionary != NULL) {
    NANOARROW_RETURN_NOT_OK(
        ArrowArrayViewSetArrayInternal(array_view->dictionary, array->dictionary, error));
  }

  return NANOARROW_OK;
}
// Validates (recursively) everything that can be checked without reading
// buffer content:
//  - length and offset are non-negative;
//  - the first two buffers are at least as large as offset + length elements
//    require (a buffer whose size is -1, i.e. unknown, is assigned that
//    minimum size instead of being checked);
//  - list, large list, fixed-size list and map views have exactly one child;
//  - children of struct, sparse union and fixed-size list views are long
//    enough for the parent.
// Returns EINVAL with a message in `error` on the first violation found,
// otherwise NANOARROW_OK.
static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
struct ArrowError* error) {
if (array_view->length < 0) {
ArrowErrorSet(error, "Expected length >= 0 but found length %ld",
(long)array_view->length);
return EINVAL;
}
if (array_view->offset < 0) {
ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld",
(long)array_view->offset);
return EINVAL;
}
// Calculate buffer sizes that do not require buffer access. If marked as
// unknown, assign the buffer size; otherwise, validate it.
int64_t offset_plus_length = array_view->offset + array_view->length;
// Only loop over the first two buffers because the size of the third buffer
// is always data dependent for all current Arrow types.
for (int i = 0; i < 2; i++) {
int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8;
// Initialize with a value that will cause an error if accidentally used uninitialized
int64_t min_buffer_size_bytes = array_view->buffer_views[i].size_bytes + 1;
switch (array_view->layout.buffer_type[i]) {
case NANOARROW_BUFFER_TYPE_VALIDITY:
// An empty validity buffer is accepted when there are no nulls
if (array_view->null_count == 0 && array_view->buffer_views[i].size_bytes == 0) {
continue;
}
min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length);
break;
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
// Probably don't want/need to rely on the producer to have allocated an
// offsets buffer of length 1 for a zero-size array
min_buffer_size_bytes =
(offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1);
break;
case NANOARROW_BUFFER_TYPE_DATA:
// Computed from the element size in bits, rounded up to whole bytes
min_buffer_size_bytes =
_ArrowRoundUpToMultipleOf8(array_view->layout.element_size_bits[i] *
offset_plus_length) /
8;
break;
case NANOARROW_BUFFER_TYPE_TYPE_ID:
case NANOARROW_BUFFER_TYPE_UNION_OFFSET:
min_buffer_size_bytes = element_size_bytes * offset_plus_length;
break;
case NANOARROW_BUFFER_TYPE_NONE:
// No buffer in this slot: nothing to assign or validate
continue;
}
// Assign or validate buffer size
if (array_view->buffer_views[i].size_bytes == -1) {
array_view->buffer_views[i].size_bytes = min_buffer_size_bytes;
} else if (array_view->buffer_views[i].size_bytes < min_buffer_size_bytes) {
ArrowErrorSet(error,
"Expected %s array buffer %d to have size >= %ld bytes but found "
"buffer with %ld bytes",
ArrowTypeString(array_view->storage_type), (int)i,
(long)min_buffer_size_bytes,
(long)array_view->buffer_views[i].size_bytes);
return EINVAL;
}
}
// For list, fixed-size list and map views, we can validate the number of children
switch (array_view->storage_type) {
case NANOARROW_TYPE_LIST:
case NANOARROW_TYPE_LARGE_LIST:
case NANOARROW_TYPE_FIXED_SIZE_LIST:
case NANOARROW_TYPE_MAP:
if (array_view->n_children != 1) {
ArrowErrorSet(error, "Expected 1 child of %s array but found %ld child arrays",
ArrowTypeString(array_view->storage_type),
(long)array_view->n_children);
return EINVAL;
}
// Falls through to default, which only breaks
default:
break;
}
// For struct, the sparse union, and the fixed-size list views, we can validate child
// lengths.
int64_t child_min_length;
switch (array_view->storage_type) {
case NANOARROW_TYPE_SPARSE_UNION:
case NANOARROW_TYPE_STRUCT:
// Every child must cover offset + length elements. Note that the message
// below says "struct child" for sparse unions as well, and reports the
// child position 1-based (i + 1).
child_min_length = (array_view->offset + array_view->length);
for (int64_t i = 0; i < array_view->n_children; i++) {
if (array_view->children[i]->length < child_min_length) {
ArrowErrorSet(
error,
"Expected struct child %d to have length >= %ld but found child with "
"length %ld",
(int)(i + 1), (long)(child_min_length),
(long)array_view->children[i]->length);
return EINVAL;
}
}
break;
case NANOARROW_TYPE_FIXED_SIZE_LIST:
// children[0] is safe to dereference: the switch above already returned
// EINVAL unless n_children == 1
child_min_length = (array_view->offset + array_view->length) *
array_view->layout.child_size_elements;
if (array_view->children[0]->length < child_min_length) {
ArrowErrorSet(error,
"Expected child of fixed_size_list array to have length >= %ld but "
"found array with length %ld",
(long)child_min_length, (long)array_view->children[0]->length);
return EINVAL;
}
break;
default:
break;
}
// Recurse for children
for (int64_t i = 0; i < array_view->n_children; i++) {
NANOARROW_RETURN_NOT_OK(
ArrowArrayViewValidateMinimal(array_view->children[i], error));
}
// Recurse for dictionary
if (array_view->dictionary != NULL) {
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view->dictionary, error));
}
return NANOARROW_OK;
}
// Runs minimal validation and then (recursively) the checks that need to read
// the first and last relevant entries of the offsets buffer:
//  - string/binary (32-bit offsets) and large string/binary (64-bit offsets):
//    the first offset is >= 0 and the data buffer (buffer 2) holds at least
//    "last offset" bytes; a data buffer of unknown size (-1) is assigned that
//    size instead, or 0 when the offsets buffer is empty;
//  - list/map (32-bit offsets) and large list (64-bit offsets): the first
//    offset is >= 0 and the single child has at least "last offset" elements;
//  - struct: every child has at least offset + length elements.
// "Last offset" is the offsets entry at index offset + length. Returns EINVAL
// with a message in `error` on the first violation, otherwise NANOARROW_OK.
static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
struct ArrowError* error) {
// Perform minimal validation. This will validate or assign
// buffer sizes as long as buffer access is not required.
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error));
// Calculate buffer sizes or child lengths that require accessing the offsets
// buffer. Where appropriate, validate that the first offset is >= 0.
// If a buffer size is marked as unknown, assign it; otherwise, validate it.
int64_t offset_plus_length = array_view->offset + array_view->length;
int64_t first_offset;
int64_t last_offset;
switch (array_view->storage_type) {
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_BINARY:
// Offsets are only read when the offsets buffer is non-empty
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int32[0];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
(long)first_offset);
return EINVAL;
}
last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
// If the data buffer size is unknown, assign it; otherwise, check it
if (array_view->buffer_views[2].size_bytes == -1) {
array_view->buffer_views[2].size_bytes = last_offset;
} else if (array_view->buffer_views[2].size_bytes < last_offset) {
ArrowErrorSet(error,
"Expected %s array buffer 2 to have size >= %ld bytes but found "
"buffer with %ld bytes",
ArrowTypeString(array_view->storage_type), (long)last_offset,
(long)array_view->buffer_views[2].size_bytes);
return EINVAL;
}
} else if (array_view->buffer_views[2].size_bytes == -1) {
// If the data buffer size is unknown and there are no bytes in the offset buffer,
// set the data buffer size to 0.
array_view->buffer_views[2].size_bytes = 0;
}
break;
case NANOARROW_TYPE_LARGE_STRING:
case NANOARROW_TYPE_LARGE_BINARY:
// Same checks as the string/binary case above, with 64-bit offsets
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int64[0];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
(long)first_offset);
return EINVAL;
}
last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
// If the data buffer size is unknown, assign it; otherwise, check it
if (array_view->buffer_views[2].size_bytes == -1) {
array_view->buffer_views[2].size_bytes = last_offset;
} else if (array_view->buffer_views[2].size_bytes < last_offset) {
ArrowErrorSet(error,
"Expected %s array buffer 2 to have size >= %ld bytes but found "
"buffer with %ld bytes",
ArrowTypeString(array_view->storage_type), (long)last_offset,
(long)array_view->buffer_views[2].size_bytes);
return EINVAL;
}
} else if (array_view->buffer_views[2].size_bytes == -1) {
// If the data buffer size is unknown and there are no bytes in the offset
// buffer, set the data buffer size to 0.
array_view->buffer_views[2].size_bytes = 0;
}
break;
case NANOARROW_TYPE_STRUCT:
// Each child must cover offset + length elements (child position in the
// message is 1-based)
for (int64_t i = 0; i < array_view->n_children; i++) {
if (array_view->children[i]->length < offset_plus_length) {
ArrowErrorSet(
error,
"Expected struct child %d to have length >= %ld but found child with "
"length %ld",
(int)(i + 1), (long)offset_plus_length,
(long)array_view->children[i]->length);
return EINVAL;
}
}
break;
case NANOARROW_TYPE_LIST:
case NANOARROW_TYPE_MAP:
// The child must contain at least as many elements as the last offset
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int32[0];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
(long)first_offset);
return EINVAL;
}
last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
if (array_view->children[0]->length < last_offset) {
ArrowErrorSet(
error,
"Expected child of %s array to have length >= %ld but found array with "
"length %ld",
ArrowTypeString(array_view->storage_type), (long)last_offset,
(long)array_view->children[0]->length);
return EINVAL;
}
}
break;
case NANOARROW_TYPE_LARGE_LIST:
// Same check as the list/map case above, with 64-bit offsets
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int64[0];
if (first_offset < 0) {
ArrowErrorSet(error, "Expected first offset >= 0 but found %ld",
(long)first_offset);
return EINVAL;
}
last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
if (array_view->children[0]->length < last_offset) {
ArrowErrorSet(
error,
"Expected child of large list array to have length >= %ld but found array "
"with length %ld",
(long)last_offset, (long)array_view->children[0]->length);
return EINVAL;
}
}
break;
default:
break;
}
// Recurse for children
for (int64_t i = 0; i < array_view->n_children; i++) {
NANOARROW_RETURN_NOT_OK(
ArrowArrayViewValidateDefault(array_view->children[i], error));
}
// Recurse for dictionary
if (array_view->dictionary != NULL) {
NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view->dictionary, error));
}
return NANOARROW_OK;
}
// Points `array_view` at `array` and validates it at the default level.
ArrowErrorCode ArrowArrayViewSetArray(struct ArrowArrayView* array_view,
                                      const struct ArrowArray* array,
                                      struct ArrowError* error) {
  // Step 1: copy pointers, lengths and structure from the array into the view
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error));

  // Step 2: default validation. Step 1 marked every non-NULL buffer as having
  // an unknown size, so validation also fills in the buffer sizes as it goes.
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error));

  return NANOARROW_OK;
}
// Points `array_view` at `array` and validates it at the minimal level only,
// i.e. without the checks that read the offsets buffers.
ArrowErrorCode ArrowArrayViewSetArrayMinimal(struct ArrowArrayView* array_view,
                                             const struct ArrowArray* array,
                                             struct ArrowError* error) {
  // Step 1: copy pointers, lengths and structure from the array into the view
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArrayInternal(array_view, array, error));

  // Step 2: minimal validation. Step 1 marked every non-NULL buffer as having
  // an unknown size; minimal validation fills in the sizes it can compute
  // without buffer access.
  NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateMinimal(array_view, error));

  return NANOARROW_OK;
}
static int ArrowAssertIncreasingInt32(struct ArrowBufferView view,
struct ArrowError* error) {
if (view.size_bytes <= (int64_t)sizeof(int32_t)) {
return NANOARROW_OK;
}
for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int32_t); i++) {
if (view.data.as_int32[i] < view.data.as_int32[i - 1]) {
ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i);
return EINVAL;
}
}
return NANOARROW_OK;
}
static int ArrowAssertIncreasingInt64(struct ArrowBufferView view,
struct ArrowError* error) {
if (view.size_bytes <= (int64_t)sizeof(int64_t)) {
return NANOARROW_OK;
}
for (int64_t i = 1; i < view.size_bytes / (int64_t)sizeof(int64_t); i++) {
if (view.data.as_int64[i] < view.data.as_int64[i - 1]) {
ArrowErrorSet(error, "[%ld] Expected element size >= 0", (long)i);
return EINVAL;
}
}
return NANOARROW_OK;
}
// Check that every byte of view, read as int8, lies within the inclusive range
// [min_value, max_value]. Returns NANOARROW_OK if all bytes are in range;
// otherwise sets error to describe the first out-of-range byte and returns
// EINVAL.
static int ArrowAssertRangeInt8(struct ArrowBufferView view, int8_t min_value,
                                int8_t max_value, struct ArrowError* error) {
  for (int64_t pos = 0; pos < view.size_bytes; pos++) {
    const int8_t value = view.data.as_int8[pos];
    if (value >= min_value && value <= max_value) {
      continue;
    }

    ArrowErrorSet(error,
                  "[%ld] Expected buffer value between %d and %d but found value %d",
                  (long)pos, (int)min_value, (int)max_value, (int)value);
    return EINVAL;
  }

  return NANOARROW_OK;
}
static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values,
int64_t n_values, struct ArrowError* error) {
for (int64_t i = 0; i < view.size_bytes; i++) {
int item_found = 0;
for (int64_t j = 0; j < n_values; j++) {
if (view.data.as_int8[i] == values[j]) {
item_found = 1;
break;
}
}
if (!item_found) {
ArrowErrorSet(error, "[%ld] Unexpected buffer value %d", (long)i,
(int)view.data.as_int8[i]);
return EINVAL;
}
}
return NANOARROW_OK;
}
// Run the content-level checks on array_view and, recursively, on its children
// and dictionary:
//   - every offsets buffer must be non-decreasing;
//   - for unions, every value of the first buffer must be an accepted type id;
//   - for dense unions, every offset must fall within its child's length.
// Returns NANOARROW_OK if all checks pass; otherwise returns the first failing
// error code (EINVAL for the checks performed directly here) with error set.
static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
                                      struct ArrowError* error) {
  // Offsets buffers: 32-bit elements get the int32 check, anything else the
  // int64 check.
  for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
    if (array_view->layout.buffer_type[i] != NANOARROW_BUFFER_TYPE_DATA_OFFSET) {
      continue;
    }

    if (array_view->layout.element_size_bits[i] == 32) {
      NANOARROW_RETURN_NOT_OK(
          ArrowAssertIncreasingInt32(array_view->buffer_views[i], error));
    } else {
      NANOARROW_RETURN_NOT_OK(
          ArrowAssertIncreasingInt64(array_view->buffer_views[i], error));
    }
  }

  const int is_dense_union = array_view->storage_type == NANOARROW_TYPE_DENSE_UNION;
  const int is_sparse_union = array_view->storage_type == NANOARROW_TYPE_SPARSE_UNION;

  // Union type ids live in the first buffer.
  if (is_dense_union || is_sparse_union) {
    if (array_view->union_type_id_map == NULL) {
      // If the union_type_id map is NULL (e.g., when using ArrowArrayInitFromType() +
      // ArrowArrayAllocateChildren() + ArrowArrayFinishBuilding()), we don't have enough
      // information to validate this buffer.
      ArrowErrorSet(error,
                    "Insufficient information provided for validation of union array");
      return EINVAL;
    }

    if (_ArrowParsedUnionTypeIdsWillEqualChildIndices(array_view->union_type_id_map,
                                                      array_view->n_children,
                                                      array_view->n_children)) {
      // Type ids coincide with child indices, so a simple range check is enough.
      NANOARROW_RETURN_NOT_OK(ArrowAssertRangeInt8(
          array_view->buffer_views[0], 0, (int8_t)(array_view->n_children - 1), error));
    } else {
      // Otherwise each value must appear among the n_children ids stored at
      // union_type_id_map + 128.
      NANOARROW_RETURN_NOT_OK(ArrowAssertInt8In(array_view->buffer_views[0],
                                                array_view->union_type_id_map + 128,
                                                array_view->n_children, error));
    }
  }

  if (is_dense_union && array_view->union_type_id_map != NULL) {
    // Check that offsets refer to child elements that actually exist.
    // NOTE(review): the upper bound is inclusive, so offset == child_length is
    // accepted here -- confirm whether that is intended.
    for (int64_t i = 0; i < array_view->length; i++) {
      const int8_t child_id = ArrowArrayViewUnionChildIndex(array_view, i);
      const int64_t offset = ArrowArrayViewUnionChildOffset(array_view, i);
      const int64_t child_length = array_view->children[child_id]->length;
      if (offset >= 0 && offset <= child_length) {
        continue;
      }

      ArrowErrorSet(
          error,
          "[%ld] Expected union offset for child id %d to be between 0 and %ld but "
          "found offset value %ld",
          (long)i, (int)child_id, (long)child_length, (long)offset);
      return EINVAL;
    }
  }

  // Recurse for children
  for (int64_t i = 0; i < array_view->n_children; i++) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error));
  }

  // Recurse for the dictionary itself; validating the indices against it is
  // not implemented.
  if (array_view->dictionary != NULL) {
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error));
    // TODO: validate the indices
  }

  return NANOARROW_OK;
}
// Validate array_view at the requested validation_level:
//   - NONE:    no checks; always NANOARROW_OK
//   - MINIMAL: the minimal checks only
//   - DEFAULT: the default checks only
//   - FULL:    the default checks followed by the full checks
// Returns NANOARROW_OK on success or the error code of the first failing check.
// A validation_level outside these four values sets error and returns EINVAL.
ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view,
                                      enum ArrowValidationLevel validation_level,
                                      struct ArrowError* error) {
  if (validation_level == NANOARROW_VALIDATION_LEVEL_NONE) {
    return NANOARROW_OK;
  }

  if (validation_level == NANOARROW_VALIDATION_LEVEL_MINIMAL) {
    return ArrowArrayViewValidateMinimal(array_view, error);
  }

  if (validation_level == NANOARROW_VALIDATION_LEVEL_DEFAULT) {
    return ArrowArrayViewValidateDefault(array_view, error);
  }

  if (validation_level == NANOARROW_VALIDATION_LEVEL_FULL) {
    // The full checks run only after the default checks have passed.
    NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateDefault(array_view, error));
    return ArrowArrayViewValidateFull(array_view, error);
  }

  ArrowErrorSet(error, "validation_level not recognized");
  return EINVAL;
}