| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| # distutils: language = c++ |
| |
| from libcpp.unordered_map cimport unordered_map |
| |
| from pyarrow.includes.common cimport * |
| from pyarrow.includes.libarrow cimport * |
| from pyarrow.includes.libarrow_fs cimport * |
| from pyarrow._parquet cimport * |
| |
| |
| cdef extern from "arrow/api.h" namespace "arrow" nogil: |
| |
| cdef cppclass CRecordBatchIterator "arrow::RecordBatchIterator"( |
| CIterator[shared_ptr[CRecordBatch]]): |
| pass |
| |
| |
| cdef extern from * namespace "arrow::compute": |
| # inlined from expression_internal.h to avoid |
| # proliferation of #include <unordered_map> |
| """ |
| #include <unordered_map> |
| |
| #include "arrow/type.h" |
| #include "arrow/datum.h" |
| |
| namespace arrow { |
| namespace compute { |
| struct KnownFieldValues { |
| std::unordered_map<FieldRef, Datum, FieldRef::Hash> map; |
| }; |
| } // namespace compute |
| } // namespace arrow |
| """ |
| cdef struct CKnownFieldValues "arrow::compute::KnownFieldValues": |
| unordered_map[CFieldRef, CDatum, CFieldRefHash] map |
| |
| cdef extern from "arrow/compute/exec/expression.h" \ |
| namespace "arrow::compute" nogil: |
| |
| cdef cppclass CExpression "arrow::compute::Expression": |
| c_bool Equals(const CExpression& other) const |
| c_string ToString() const |
| CResult[CExpression] Bind(const CSchema&) |
| |
| cdef CExpression CMakeScalarExpression \ |
| "arrow::compute::literal"(shared_ptr[CScalar] value) |
| |
| cdef CExpression CMakeFieldExpression \ |
| "arrow::compute::field_ref"(c_string name) |
| |
| cdef CExpression CMakeCallExpression \ |
| "arrow::compute::call"(c_string function, |
| vector[CExpression] arguments, |
| shared_ptr[CFunctionOptions] options) |
| |
| cdef CResult[shared_ptr[CBuffer]] CSerializeExpression \ |
| "arrow::compute::Serialize"(const CExpression&) |
| |
| cdef CResult[CExpression] CDeserializeExpression \ |
| "arrow::compute::Deserialize"(shared_ptr[CBuffer]) |
| |
| cdef CResult[CKnownFieldValues] \ |
| CExtractKnownFieldValues "arrow::compute::ExtractKnownFieldValues"( |
| const CExpression& partition_expression) |
| |
| ctypedef CStatus cb_writer_finish_internal(CFileWriter*) |
| ctypedef void cb_writer_finish(dict, CFileWriter*) |
| |
| cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: |
| |
| cdef enum ExistingDataBehavior" arrow::dataset::ExistingDataBehavior": |
| ExistingDataBehavior_DELETE_MATCHING" \ |
| arrow::dataset::ExistingDataBehavior::kDeleteMatchingPartitions" |
| ExistingDataBehavior_OVERWRITE_OR_IGNORE" \ |
| arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore" |
| ExistingDataBehavior_ERROR" \ |
| arrow::dataset::ExistingDataBehavior::kError" |
| |
| cdef cppclass CScanOptions "arrow::dataset::ScanOptions": |
| @staticmethod |
| shared_ptr[CScanOptions] Make(shared_ptr[CSchema] schema) |
| |
| shared_ptr[CSchema] dataset_schema |
| shared_ptr[CSchema] projected_schema |
| |
| cdef cppclass CFragmentScanOptions "arrow::dataset::FragmentScanOptions": |
| c_string type_name() const |
| |
| ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \ |
| "arrow::dataset::ScanTaskIterator" |
| |
| cdef cppclass CScanTask" arrow::dataset::ScanTask": |
| CResult[CRecordBatchIterator] Execute() |
| |
| cdef cppclass CFragment "arrow::dataset::Fragment": |
| CResult[shared_ptr[CSchema]] ReadPhysicalSchema() |
| CResult[CScanTaskIterator] Scan(shared_ptr[CScanOptions] options) |
| c_bool splittable() const |
| c_string type_name() const |
| const CExpression& partition_expression() const |
| |
| ctypedef vector[shared_ptr[CFragment]] CFragmentVector \ |
| "arrow::dataset::FragmentVector" |
| |
| ctypedef CIterator[shared_ptr[CFragment]] CFragmentIterator \ |
| "arrow::dataset::FragmentIterator" |
| |
| cdef cppclass CInMemoryFragment "arrow::dataset::InMemoryFragment"( |
| CFragment): |
| CInMemoryFragment(vector[shared_ptr[CRecordBatch]] record_batches, |
| CExpression partition_expression) |
| |
| cdef cppclass CTaggedRecordBatch "arrow::dataset::TaggedRecordBatch": |
| shared_ptr[CRecordBatch] record_batch |
| shared_ptr[CFragment] fragment |
| |
| ctypedef CIterator[CTaggedRecordBatch] CTaggedRecordBatchIterator \ |
| "arrow::dataset::TaggedRecordBatchIterator" |
| |
| cdef cppclass CScanner "arrow::dataset::Scanner": |
| CScanner(shared_ptr[CDataset], shared_ptr[CScanOptions]) |
| CScanner(shared_ptr[CFragment], shared_ptr[CScanOptions]) |
| CResult[CScanTaskIterator] Scan() |
| CResult[CTaggedRecordBatchIterator] ScanBatches() |
| CResult[shared_ptr[CTable]] ToTable() |
| CResult[shared_ptr[CTable]] TakeRows(const CArray& indices) |
| CResult[shared_ptr[CTable]] Head(int64_t num_rows) |
| CResult[int64_t] CountRows() |
| CResult[CFragmentIterator] GetFragments() |
| CResult[shared_ptr[CRecordBatchReader]] ToRecordBatchReader() |
| const shared_ptr[CScanOptions]& options() |
| |
| cdef cppclass CScannerBuilder "arrow::dataset::ScannerBuilder": |
| CScannerBuilder(shared_ptr[CDataset], |
| shared_ptr[CScanOptions] scan_options) |
| CScannerBuilder(shared_ptr[CSchema], shared_ptr[CFragment], |
| shared_ptr[CScanOptions] scan_options) |
| |
| @staticmethod |
| shared_ptr[CScannerBuilder] FromRecordBatchReader( |
| shared_ptr[CRecordBatchReader] reader) |
| CStatus ProjectColumns "Project"(const vector[c_string]& columns) |
| CStatus Project(vector[CExpression]& exprs, vector[c_string]& columns) |
| CStatus Filter(CExpression filter) |
| CStatus UseThreads(c_bool use_threads) |
| CStatus UseAsync(c_bool use_async) |
| CStatus Pool(CMemoryPool* pool) |
| CStatus BatchSize(int64_t batch_size) |
| CStatus FragmentScanOptions( |
| shared_ptr[CFragmentScanOptions] fragment_scan_options) |
| CResult[shared_ptr[CScanner]] Finish() |
| shared_ptr[CSchema] schema() const |
| |
| ctypedef vector[shared_ptr[CDataset]] CDatasetVector \ |
| "arrow::dataset::DatasetVector" |
| |
| cdef cppclass CDataset "arrow::dataset::Dataset": |
| const shared_ptr[CSchema] & schema() |
| CResult[CFragmentIterator] GetFragments() |
| CResult[CFragmentIterator] GetFragments(CExpression predicate) |
| const CExpression & partition_expression() |
| c_string type_name() |
| |
| CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema]) |
| |
| CResult[shared_ptr[CScannerBuilder]] NewScan() |
| |
| cdef cppclass CInMemoryDataset "arrow::dataset::InMemoryDataset"( |
| CDataset): |
| CInMemoryDataset(shared_ptr[CRecordBatchReader]) |
| CInMemoryDataset(shared_ptr[CTable]) |
| |
| cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"( |
| CDataset): |
| @staticmethod |
| CResult[shared_ptr[CUnionDataset]] Make(shared_ptr[CSchema] schema, |
| CDatasetVector children) |
| |
| const CDatasetVector& children() const |
| |
| cdef cppclass CInspectOptions "arrow::dataset::InspectOptions": |
| int fragments |
| |
| cdef cppclass CFinishOptions "arrow::dataset::FinishOptions": |
| shared_ptr[CSchema] schema |
| CInspectOptions inspect_options |
| c_bool validate_fragments |
| |
| cdef cppclass CDatasetFactory "arrow::dataset::DatasetFactory": |
| CResult[vector[shared_ptr[CSchema]]] InspectSchemas(CInspectOptions) |
| CResult[shared_ptr[CSchema]] Inspect(CInspectOptions) |
| CResult[shared_ptr[CDataset]] FinishWithSchema "Finish"( |
| const shared_ptr[CSchema]& schema) |
| CResult[shared_ptr[CDataset]] Finish() |
| const CExpression& root_partition() |
| CStatus SetRootPartition(CExpression partition) |
| |
| cdef cppclass CUnionDatasetFactory "arrow::dataset::UnionDatasetFactory": |
| @staticmethod |
| CResult[shared_ptr[CDatasetFactory]] Make( |
| vector[shared_ptr[CDatasetFactory]] factories) |
| |
| cdef cppclass CFileSource "arrow::dataset::FileSource": |
| const c_string& path() const |
| const shared_ptr[CFileSystem]& filesystem() const |
| const shared_ptr[CBuffer]& buffer() const |
| # HACK: Cython can't handle all the overloads so don't declare them. |
| # This means invalid construction of CFileSource won't be caught in |
| # the C++ generation phase (though it will still be caught when |
| # the generated C++ is compiled). |
| CFileSource(...) |
| |
| cdef cppclass CFileWriteOptions \ |
| "arrow::dataset::FileWriteOptions": |
| const shared_ptr[CFileFormat]& format() const |
| c_string type_name() const |
| |
| cdef cppclass CFileWriter \ |
| "arrow::dataset::FileWriter": |
| const shared_ptr[CFileFormat]& format() const |
| const shared_ptr[CSchema]& schema() const |
| const shared_ptr[CFileWriteOptions]& options() const |
| const CFileLocator& destination() const |
| |
| cdef cppclass CParquetFileWriter \ |
| "arrow::dataset::ParquetFileWriter"(CFileWriter): |
| const shared_ptr[FileWriter]& parquet_writer() const |
| |
| cdef cppclass CFileFormat "arrow::dataset::FileFormat": |
| shared_ptr[CFragmentScanOptions] default_fragment_scan_options |
| c_string type_name() const |
| CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const |
| CResult[shared_ptr[CFileFragment]] MakeFragment( |
| CFileSource source, |
| CExpression partition_expression, |
| shared_ptr[CSchema] physical_schema) |
| shared_ptr[CFileWriteOptions] DefaultWriteOptions() |
| |
| cdef cppclass CFileFragment "arrow::dataset::FileFragment"( |
| CFragment): |
| const CFileSource& source() const |
| const shared_ptr[CFileFormat]& format() const |
| |
| cdef cppclass CParquetFileWriteOptions \ |
| "arrow::dataset::ParquetFileWriteOptions"(CFileWriteOptions): |
| shared_ptr[WriterProperties] writer_properties |
| shared_ptr[ArrowWriterProperties] arrow_writer_properties |
| |
| cdef cppclass CParquetFileFragment "arrow::dataset::ParquetFileFragment"( |
| CFileFragment): |
| const vector[int]& row_groups() const |
| shared_ptr[CFileMetaData] metadata() const |
| CResult[vector[shared_ptr[CFragment]]] SplitByRowGroup( |
| CExpression predicate) |
| CResult[shared_ptr[CFragment]] SubsetWithFilter "Subset"( |
| CExpression predicate) |
| CResult[shared_ptr[CFragment]] SubsetWithIds "Subset"( |
| vector[int] row_group_ids) |
| CStatus EnsureCompleteMetadata() |
| |
| cdef cppclass CFileSystemDatasetWriteOptions \ |
| "arrow::dataset::FileSystemDatasetWriteOptions": |
| shared_ptr[CFileWriteOptions] file_write_options |
| shared_ptr[CFileSystem] filesystem |
| c_string base_dir |
| shared_ptr[CPartitioning] partitioning |
| int max_partitions |
| c_string basename_template |
| function[cb_writer_finish_internal] writer_pre_finish |
| function[cb_writer_finish_internal] writer_post_finish |
| ExistingDataBehavior existing_data_behavior |
| |
| cdef cppclass CFileSystemDataset \ |
| "arrow::dataset::FileSystemDataset"(CDataset): |
| @staticmethod |
| CResult[shared_ptr[CDataset]] Make( |
| shared_ptr[CSchema] schema, |
| CExpression source_partition, |
| shared_ptr[CFileFormat] format, |
| shared_ptr[CFileSystem] filesystem, |
| vector[shared_ptr[CFileFragment]] fragments) |
| |
| @staticmethod |
| CStatus Write( |
| const CFileSystemDatasetWriteOptions& write_options, |
| shared_ptr[CScanner] scanner) |
| |
| c_string type() |
| vector[c_string] files() |
| const shared_ptr[CFileFormat]& format() const |
| const shared_ptr[CFileSystem]& filesystem() const |
| const shared_ptr[CPartitioning]& partitioning() const |
| |
| cdef cppclass CParquetFileFormatReaderOptions \ |
| "arrow::dataset::ParquetFileFormat::ReaderOptions": |
| unordered_set[c_string] dict_columns |
| TimeUnit coerce_int96_timestamp_unit |
| |
| cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"( |
| CFileFormat): |
| CParquetFileFormatReaderOptions reader_options |
| CResult[shared_ptr[CFileFragment]] MakeFragment( |
| CFileSource source, |
| CExpression partition_expression, |
| shared_ptr[CSchema] physical_schema, |
| vector[int] row_groups) |
| |
| cdef cppclass CParquetFragmentScanOptions \ |
| "arrow::dataset::ParquetFragmentScanOptions"(CFragmentScanOptions): |
| shared_ptr[CReaderProperties] reader_properties |
| shared_ptr[ArrowReaderProperties] arrow_reader_properties |
| c_bool enable_parallel_column_conversion |
| |
| cdef cppclass CIpcFileWriteOptions \ |
| "arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions): |
| pass |
| |
| cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"( |
| CFileFormat): |
| pass |
| |
| cdef cppclass COrcFileFormat "arrow::dataset::OrcFileFormat"( |
| CFileFormat): |
| pass |
| |
| cdef cppclass CCsvFileWriteOptions \ |
| "arrow::dataset::CsvFileWriteOptions"(CFileWriteOptions): |
| shared_ptr[CCSVWriteOptions] write_options |
| CMemoryPool* pool |
| |
| cdef cppclass CCsvFileFormat "arrow::dataset::CsvFileFormat"( |
| CFileFormat): |
| CCSVParseOptions parse_options |
| |
| cdef cppclass CCsvFragmentScanOptions \ |
| "arrow::dataset::CsvFragmentScanOptions"(CFragmentScanOptions): |
| CCSVConvertOptions convert_options |
| CCSVReadOptions read_options |
| |
| cdef cppclass CPartitioning "arrow::dataset::Partitioning": |
| c_string type_name() const |
| CResult[CExpression] Parse(const c_string & path) const |
| const shared_ptr[CSchema] & schema() |
| |
| cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding": |
| pass |
| |
| CSegmentEncoding CSegmentEncodingNone\ |
| " arrow::dataset::SegmentEncoding::None" |
| CSegmentEncoding CSegmentEncodingUri\ |
| " arrow::dataset::SegmentEncoding::Uri" |
| |
| cdef cppclass CKeyValuePartitioningOptions \ |
| "arrow::dataset::KeyValuePartitioningOptions": |
| CSegmentEncoding segment_encoding |
| |
| cdef cppclass CHivePartitioningOptions \ |
| "arrow::dataset::HivePartitioningOptions": |
| CSegmentEncoding segment_encoding |
| c_string null_fallback |
| |
| cdef cppclass CPartitioningFactoryOptions \ |
| "arrow::dataset::PartitioningFactoryOptions": |
| c_bool infer_dictionary |
| shared_ptr[CSchema] schema |
| CSegmentEncoding segment_encoding |
| |
| cdef cppclass CHivePartitioningFactoryOptions \ |
| "arrow::dataset::HivePartitioningFactoryOptions": |
| c_bool infer_dictionary |
| c_string null_fallback |
| shared_ptr[CSchema] schema |
| CSegmentEncoding segment_encoding |
| |
| cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory": |
| c_string type_name() const |
| |
| cdef cppclass CDirectoryPartitioning \ |
| "arrow::dataset::DirectoryPartitioning"(CPartitioning): |
| CDirectoryPartitioning(shared_ptr[CSchema] schema, |
| vector[shared_ptr[CArray]] dictionaries) |
| |
| @staticmethod |
| shared_ptr[CPartitioningFactory] MakeFactory( |
| vector[c_string] field_names, CPartitioningFactoryOptions) |
| |
| vector[shared_ptr[CArray]] dictionaries() const |
| |
| cdef cppclass CHivePartitioning \ |
| "arrow::dataset::HivePartitioning"(CPartitioning): |
| CHivePartitioning(shared_ptr[CSchema] schema, |
| vector[shared_ptr[CArray]] dictionaries, |
| CHivePartitioningOptions options) |
| |
| @staticmethod |
| shared_ptr[CPartitioningFactory] MakeFactory( |
| CHivePartitioningFactoryOptions) |
| |
| vector[shared_ptr[CArray]] dictionaries() const |
| |
| cdef cppclass CPartitioningOrFactory \ |
| "arrow::dataset::PartitioningOrFactory": |
| CPartitioningOrFactory(shared_ptr[CPartitioning]) |
| CPartitioningOrFactory(shared_ptr[CPartitioningFactory]) |
| CPartitioningOrFactory & operator = (shared_ptr[CPartitioning]) |
| CPartitioningOrFactory & operator = ( |
| shared_ptr[CPartitioningFactory]) |
| shared_ptr[CPartitioning] partitioning() const |
| shared_ptr[CPartitioningFactory] factory() const |
| |
| cdef cppclass CFileSystemFactoryOptions \ |
| "arrow::dataset::FileSystemFactoryOptions": |
| CPartitioningOrFactory partitioning |
| c_string partition_base_dir |
| c_bool exclude_invalid_files |
| vector[c_string] selector_ignore_prefixes |
| |
| cdef cppclass CFileSystemDatasetFactory \ |
| "arrow::dataset::FileSystemDatasetFactory"( |
| CDatasetFactory): |
| @staticmethod |
| CResult[shared_ptr[CDatasetFactory]] MakeFromPaths "Make"( |
| shared_ptr[CFileSystem] filesystem, |
| vector[c_string] paths, |
| shared_ptr[CFileFormat] format, |
| CFileSystemFactoryOptions options |
| ) |
| |
| @staticmethod |
| CResult[shared_ptr[CDatasetFactory]] MakeFromSelector "Make"( |
| shared_ptr[CFileSystem] filesystem, |
| CFileSelector, |
| shared_ptr[CFileFormat] format, |
| CFileSystemFactoryOptions options |
| ) |
| |
| cdef cppclass CParquetFactoryOptions \ |
| "arrow::dataset::ParquetFactoryOptions": |
| CPartitioningOrFactory partitioning |
| c_string partition_base_dir |
| c_bool validate_column_chunk_paths |
| |
| cdef cppclass CParquetDatasetFactory \ |
| "arrow::dataset::ParquetDatasetFactory"(CDatasetFactory): |
| @staticmethod |
| CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataPath "Make"( |
| const c_string& metadata_path, |
| shared_ptr[CFileSystem] filesystem, |
| shared_ptr[CParquetFileFormat] format, |
| CParquetFactoryOptions options |
| ) |
| |
| @staticmethod |
| CResult[shared_ptr[CDatasetFactory]] MakeFromMetaDataSource "Make"( |
| const CFileSource& metadata_path, |
| const c_string& base_path, |
| shared_ptr[CFileSystem] filesystem, |
| shared_ptr[CParquetFileFormat] format, |
| CParquetFactoryOptions options |
| ) |