blob: db0eb064ebabe8d63ebefba7ea103023ccb7f91c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <arrow-glib/error.hpp>
#include <arrow-glib/file-system.hpp>
#include <arrow-glib/output-stream.hpp>
#include <arrow-glib/record-batch.hpp>
#include <arrow-glib/reader.hpp>
#include <arrow-glib/schema.hpp>
#include <arrow-dataset-glib/file-format.hpp>
G_BEGIN_DECLS
/**
* SECTION: file-format
* @section_id: file-format
* @title: File format classes
* @include: arrow-dataset-glib/arrow-dataset-glib.h
*
* #GADatasetFileWriteOptions is a class for options to write a file
* of this format.
*
* #GADatasetFileWriter is a class for writing a file of this format.
*
* #GADatasetFileFormat is a base class for file format classes.
*
* #GADatasetCSVFileFormat is a class for CSV file format.
*
* #GADatasetIPCFileFormat is a class for IPC file format.
*
* #GADatasetParquetFileFormat is a class for Parquet file format.
*
* Since: 3.0.0
*/
typedef struct GADatasetFileWriteOptionsPrivate_ {
std::shared_ptr<arrow::dataset::FileWriteOptions> options;
} GADatasetFileWriteOptionsPrivate;
enum {
PROP_OPTIONS = 1,
};
G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriteOptions,
gadataset_file_write_options,
G_TYPE_OBJECT)
#define GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(obj) \
static_cast<GADatasetFileWriteOptionsPrivate *>( \
gadataset_file_write_options_get_instance_private( \
GADATASET_FILE_WRITE_OPTIONS(obj)))
static void
gadataset_file_write_options_finalize(GObject *object)
{
auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
priv->options.~shared_ptr();
G_OBJECT_CLASS(gadataset_file_write_options_parent_class)->finalize(object);
}
static void
gadataset_file_write_options_set_property(GObject *object,
guint prop_id,
const GValue *value,
GParamSpec *pspec)
{
auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
switch (prop_id) {
case PROP_OPTIONS:
priv->options =
*static_cast<std::shared_ptr<arrow::dataset::FileWriteOptions> *>(
g_value_get_pointer(value));
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
}
}
static void
gadataset_file_write_options_init(GADatasetFileWriteOptions *object)
{
auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(object);
new(&priv->options) std::shared_ptr<arrow::dataset::FileWriteOptions>;
}
static void
gadataset_file_write_options_class_init(GADatasetFileWriteOptionsClass *klass)
{
auto gobject_class = G_OBJECT_CLASS(klass);
gobject_class->finalize = gadataset_file_write_options_finalize;
gobject_class->set_property = gadataset_file_write_options_set_property;
GParamSpec *spec;
spec = g_param_spec_pointer("options",
"Options",
"The raw "
"std::shared<arrow::dataset::FileWriteOptions> *",
static_cast<GParamFlags>(G_PARAM_WRITABLE |
G_PARAM_CONSTRUCT_ONLY));
g_object_class_install_property(gobject_class, PROP_OPTIONS, spec);
}
typedef struct GADatasetFileWriterPrivate_ {
std::shared_ptr<arrow::dataset::FileWriter> writer;
} GADatasetFileWriterPrivate;
enum {
PROP_WRITER = 1,
};
G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileWriter,
gadataset_file_writer,
G_TYPE_OBJECT)
#define GADATASET_FILE_WRITER_GET_PRIVATE(obj) \
static_cast<GADatasetFileWriterPrivate *>( \
gadataset_file_writer_get_instance_private( \
GADATASET_FILE_WRITER(obj)))
static void
gadataset_file_writer_finalize(GObject *object)
{
auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
priv->writer.~shared_ptr();
G_OBJECT_CLASS(gadataset_file_writer_parent_class)->finalize(object);
}
static void
gadataset_file_writer_set_property(GObject *object,
guint prop_id,
const GValue *value,
GParamSpec *pspec)
{
auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
switch (prop_id) {
case PROP_WRITER:
priv->writer =
*static_cast<std::shared_ptr<arrow::dataset::FileWriter> *>(
g_value_get_pointer(value));
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
}
}
static void
gadataset_file_writer_init(GADatasetFileWriter *object)
{
auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(object);
new(&(priv->writer)) std::shared_ptr<arrow::dataset::FileWriter>;
}
static void
gadataset_file_writer_class_init(GADatasetFileWriterClass *klass)
{
auto gobject_class = G_OBJECT_CLASS(klass);
gobject_class->finalize = gadataset_file_writer_finalize;
gobject_class->set_property = gadataset_file_writer_set_property;
GParamSpec *spec;
spec = g_param_spec_pointer("writer",
"Writer",
"The raw "
"std::shared<arrow::dataset::FileWriter> *",
static_cast<GParamFlags>(G_PARAM_WRITABLE |
G_PARAM_CONSTRUCT_ONLY));
g_object_class_install_property(gobject_class, PROP_WRITER, spec);
}
/**
* gadataset_file_writer_write_record_batch:
* @writer: A #GADatasetFileWriter.
* @record_batch: A #GArrowRecordBatch to be written.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: %TRUE on success, %FALSE on error.
*
* Since: 6.0.0
*/
gboolean
gadataset_file_writer_write_record_batch(GADatasetFileWriter *writer,
GArrowRecordBatch *record_batch,
GError **error)
{
const auto arrow_writer = gadataset_file_writer_get_raw(writer);
const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch);
auto status = arrow_writer->Write(arrow_record_batch);
return garrow::check(error, status, "[file-writer][write-record-batch]");
}
/**
* gadataset_file_writer_write_record_batch_reader:
* @writer: A #GADatasetFileWriter.
* @reader: A #GArrowRecordBatchReader to be written.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: %TRUE on success, %FALSE on error.
*
* Since: 6.0.0
*/
gboolean
gadataset_file_writer_write_record_batch_reader(GADatasetFileWriter *writer,
GArrowRecordBatchReader *reader,
GError **error)
{
const auto arrow_writer = gadataset_file_writer_get_raw(writer);
auto arrow_reader = garrow_record_batch_reader_get_raw(reader);
auto status = arrow_writer->Write(arrow_reader.get());
return garrow::check(error,
status,
"[file-writer][write-record-batch-reader]");
}
/**
* gadataset_file_writer_finish:
* @writer: A #GADatasetFileWriter.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: %TRUE on success, %FALSE on error.
*
* Since: 6.0.0
*/
gboolean
gadataset_file_writer_finish(GADatasetFileWriter *writer,
GError **error)
{
const auto arrow_writer = gadataset_file_writer_get_raw(writer);
auto status = arrow_writer->Finish().status();
return garrow::check(error,
status,
"[file-writer][finish]");
}
typedef struct GADatasetFileFormatPrivate_ {
std::shared_ptr<arrow::dataset::FileFormat> format;
} GADatasetFileFormatPrivate;
enum {
PROP_FORMAT = 1,
};
G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat,
gadataset_file_format,
G_TYPE_OBJECT)
#define GADATASET_FILE_FORMAT_GET_PRIVATE(obj) \
static_cast<GADatasetFileFormatPrivate *>( \
gadataset_file_format_get_instance_private( \
GADATASET_FILE_FORMAT(obj)))
static void
gadataset_file_format_finalize(GObject *object)
{
auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
priv->format.~shared_ptr();
G_OBJECT_CLASS(gadataset_file_format_parent_class)->finalize(object);
}
static void
gadataset_file_format_set_property(GObject *object,
guint prop_id,
const GValue *value,
GParamSpec *pspec)
{
auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
switch (prop_id) {
case PROP_FORMAT:
priv->format =
*static_cast<std::shared_ptr<arrow::dataset::FileFormat> *>(
g_value_get_pointer(value));
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
break;
}
}
static void
gadataset_file_format_init(GADatasetFileFormat *object)
{
auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object);
new(&priv->format) std::shared_ptr<arrow::dataset::FileFormat>;
}
static void
gadataset_file_format_class_init(GADatasetFileFormatClass *klass)
{
auto gobject_class = G_OBJECT_CLASS(klass);
gobject_class->finalize = gadataset_file_format_finalize;
gobject_class->set_property = gadataset_file_format_set_property;
GParamSpec *spec;
spec = g_param_spec_pointer("format",
"Format",
"The raw std::shared<arrow::dataset::FileFormat> *",
static_cast<GParamFlags>(G_PARAM_WRITABLE |
G_PARAM_CONSTRUCT_ONLY));
g_object_class_install_property(gobject_class, PROP_FORMAT, spec);
}
/**
* gadataset_file_format_get_type_name:
* @format: A #GADatasetFileFormat.
*
* Returns: The type name of @format.
*
* It should be freed with g_free() when no longer needed.
*
* Since: 3.0.0
*/
gchar *
gadataset_file_format_get_type_name(GADatasetFileFormat *format)
{
const auto arrow_format = gadataset_file_format_get_raw(format);
const auto &type_name = arrow_format->type_name();
return g_strndup(type_name.data(), type_name.size());
}
/**
* gadataset_file_format_get_default_write_options:
* @format: A #GADatasetFileFormat.
*
* Returns: (transfer full): The default #GADatasetFileWriteOptions of @format.
*
* Since: 6.0.0
*/
GADatasetFileWriteOptions *
gadataset_file_format_get_default_write_options(GADatasetFileFormat *format)
{
const auto arrow_format = gadataset_file_format_get_raw(format);
auto arrow_options = arrow_format->DefaultWriteOptions();
return gadataset_file_write_options_new_raw(&arrow_options);
}
/**
* gadataset_file_format_open_writer:
* @format: A #GADatasetFileFormat.
* @destination: A #GArrowOutputStream.
* @file_system: The #GArrowFileSystem of @destination.
* @path: The path of @destination.
* @schema: A #GArrowSchema that is used by written record batches.
* @options: A #GADatasetFileWriteOptions.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: (transfer full): The newly created #GADatasetFileWriter of @format
* on success, %NULL on error.
*
* Since: 6.0.0
*/
GADatasetFileWriter *
gadataset_file_format_open_writer(GADatasetFileFormat *format,
GArrowOutputStream *destination,
GArrowFileSystem *file_system,
const gchar *path,
GArrowSchema *schema,
GADatasetFileWriteOptions *options,
GError **error)
{
const auto arrow_format = gadataset_file_format_get_raw(format);
auto arrow_destination = garrow_output_stream_get_raw(destination);
auto arrow_file_system = garrow_file_system_get_raw(file_system);
auto arrow_schema = garrow_schema_get_raw(schema);
auto arrow_options = gadataset_file_write_options_get_raw(options);
auto arrow_writer_result =
arrow_format->MakeWriter(arrow_destination,
arrow_schema,
arrow_options,
{arrow_file_system, path});
if (garrow::check(error, arrow_writer_result, "[file-format][open-writer]")) {
auto arrow_writer = *arrow_writer_result;
return gadataset_file_writer_new_raw(&arrow_writer);
} else {
return NULL;
}
}
/**
* gadataset_file_format_equal:
* @format: A #GADatasetFileFormat.
* @other_format: A #GADatasetFileFormat to be compared.
*
* Returns: %TRUE if they are the same content file format, %FALSE otherwise.
*
* Since: 3.0.0
*/
gboolean
gadataset_file_format_equal(GADatasetFileFormat *format,
GADatasetFileFormat *other_format)
{
const auto arrow_format = gadataset_file_format_get_raw(format);
const auto arrow_other_format = gadataset_file_format_get_raw(other_format);
return arrow_format->Equals(*arrow_other_format);
}
G_DEFINE_TYPE(GADatasetCSVFileFormat,
gadataset_csv_file_format,
GADATASET_TYPE_FILE_FORMAT)
static void
gadataset_csv_file_format_init(GADatasetCSVFileFormat *object)
{
}
static void
gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass *klass)
{
}
/**
* gadataset_csv_file_format_new:
*
* Returns: The newly created CSV file format.
*
* Since: 3.0.0
*/
GADatasetCSVFileFormat *
gadataset_csv_file_format_new(void)
{
std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
std::make_shared<arrow::dataset::CsvFileFormat>();
return GADATASET_CSV_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format));
}
G_DEFINE_TYPE(GADatasetIPCFileFormat,
gadataset_ipc_file_format,
GADATASET_TYPE_FILE_FORMAT)
static void
gadataset_ipc_file_format_init(GADatasetIPCFileFormat *object)
{
}
static void
gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass *klass)
{
}
/**
* gadataset_ipc_file_format_new:
*
* Returns: The newly created IPC file format.
*
* Since: 3.0.0
*/
GADatasetIPCFileFormat *
gadataset_ipc_file_format_new(void)
{
std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
std::make_shared<arrow::dataset::IpcFileFormat>();
return GADATASET_IPC_FILE_FORMAT(gadataset_file_format_new_raw(&arrow_format));
}
G_DEFINE_TYPE(GADatasetParquetFileFormat,
gadataset_parquet_file_format,
GADATASET_TYPE_FILE_FORMAT)
static void
gadataset_parquet_file_format_init(GADatasetParquetFileFormat *object)
{
}
static void
gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass *klass)
{
}
/**
* gadataset_parquet_file_format_new:
*
* Returns: The newly created Parquet file format.
*
* Since: 3.0.0
*/
GADatasetParquetFileFormat *
gadataset_parquet_file_format_new(void)
{
std::shared_ptr<arrow::dataset::FileFormat> arrow_format =
std::make_shared<arrow::dataset::ParquetFileFormat>();
return GADATASET_PARQUET_FILE_FORMAT(
gadataset_file_format_new_raw(&arrow_format));
}
G_END_DECLS
GADatasetFileWriteOptions *
gadataset_file_write_options_new_raw(
std::shared_ptr<arrow::dataset::FileWriteOptions> *arrow_options)
{
return GADATASET_FILE_WRITE_OPTIONS(
g_object_new(GADATASET_TYPE_FILE_WRITE_OPTIONS,
"options", arrow_options,
NULL));
}
std::shared_ptr<arrow::dataset::FileWriteOptions>
gadataset_file_write_options_get_raw(GADatasetFileWriteOptions *options)
{
auto priv = GADATASET_FILE_WRITE_OPTIONS_GET_PRIVATE(options);
return priv->options;
}
GADatasetFileWriter *
gadataset_file_writer_new_raw(
std::shared_ptr<arrow::dataset::FileWriter> *arrow_writer)
{
return GADATASET_FILE_WRITER(g_object_new(GADATASET_TYPE_FILE_WRITER,
"writer", arrow_writer,
NULL));
}
std::shared_ptr<arrow::dataset::FileWriter>
gadataset_file_writer_get_raw(GADatasetFileWriter *writer)
{
auto priv = GADATASET_FILE_WRITER_GET_PRIVATE(writer);
return priv->writer;
}
GADatasetFileFormat *
gadataset_file_format_new_raw(
std::shared_ptr<arrow::dataset::FileFormat> *arrow_format)
{
GType type = GADATASET_TYPE_FILE_FORMAT;
const auto &type_name = (*arrow_format)->type_name();
if (type_name == "csv") {
type = GADATASET_TYPE_CSV_FILE_FORMAT;
} else if (type_name == "ipc") {
type = GADATASET_TYPE_IPC_FILE_FORMAT;
} else if (type_name == "parquet") {
type = GADATASET_TYPE_PARQUET_FILE_FORMAT;
}
return GADATASET_FILE_FORMAT(g_object_new(type,
"format", arrow_format,
NULL));
}
std::shared_ptr<arrow::dataset::FileFormat>
gadataset_file_format_get_raw(GADatasetFileFormat *format)
{
auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(format);
return priv->format;
}