// cpp/src/parquet/encoding.h - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include <cstdint>
 #include <cstring>
 #include <memory>
 #include <vector>

 #include "arrow/type_fwd.h"

 #include "parquet/exception.h"
 #include "parquet/platform.h"
 #include "parquet/types.h"

 // Forward declaration of the Arrow dictionary builder template, so that the
 // EncodingTraits<...>::DictAccumulator aliases in this header can name it.
 namespace arrow {
 template <typename T>
 class Dictionary32Builder;
 }

 namespace parquet {

 // Forward declaration of the typed encoder interface (defined later in this header).
 template <typename DType>
 class TypedEncoder;

 // Convenience aliases: one typed encoder per Parquet physical type.
 using BooleanEncoder = TypedEncoder<BooleanType>;
 using Int32Encoder = TypedEncoder<Int32Type>;
 using Int64Encoder = TypedEncoder<Int64Type>;
 using Int96Encoder = TypedEncoder<Int96Type>;
 using FloatEncoder = TypedEncoder<FloatType>;
 using DoubleEncoder = TypedEncoder<DoubleType>;
 using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
 using FLBAEncoder = TypedEncoder<FLBAType>;

 // Forward declaration of the typed decoder interface (defined later in this header).
 template <typename DType>
 class TypedDecoder;

 // Decoders per Parquet physical type. BooleanDecoder and FLBADecoder are
 // separate classes (only forward-declared here) rather than plain aliases.
 class BooleanDecoder;
 using Int32Decoder = TypedDecoder<Int32Type>;
 using Int64Decoder = TypedDecoder<Int64Type>;
 using Int96Decoder = TypedDecoder<Int96Type>;
 using FloatDecoder = TypedDecoder<FloatType>;
 using DoubleDecoder = TypedDecoder<DoubleType>;
 using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
 class FLBADecoder;

 // Traits mapping a Parquet type to its encoder, decoder and Arrow accumulator
 // types. The primary template is only declared; each supported type provides
 // an explicit specialization.
 template <typename T>
 struct EncodingTraits;

 // Traits for BooleanType: encoder/decoder classes plus the Arrow type and
 // builder used when decoding into Arrow. DictAccumulator is an empty
 // placeholder struct for this type.
 template <>
 struct EncodingTraits<BooleanType> {
   using Encoder = BooleanEncoder;
   using Decoder = BooleanDecoder;

   using ArrowType = ::arrow::BooleanType;
   using Accumulator = ::arrow::BooleanBuilder;
   struct DictAccumulator {};
 };

 // Traits for Int32Type: values accumulate into a NumericBuilder, and
 // dictionary-encoded values into a Dictionary32Builder, both over
 // ::arrow::Int32Type.
 template <>
 struct EncodingTraits<Int32Type> {
   using Encoder = Int32Encoder;
   using Decoder = Int32Decoder;

   using ArrowType = ::arrow::Int32Type;
   using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
 };

 // Traits for Int64Type: values accumulate into a NumericBuilder, and
 // dictionary-encoded values into a Dictionary32Builder, both over
 // ::arrow::Int64Type.
 template <>
 struct EncodingTraits<Int64Type> {
   using Encoder = Int64Encoder;
   using Decoder = Int64Decoder;

   using ArrowType = ::arrow::Int64Type;
   using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
 };

 // Traits for Int96Type. No ArrowType alias is defined for this type, and both
 // Accumulator and DictAccumulator are empty placeholder structs.
 template <>
 struct EncodingTraits<Int96Type> {
   using Encoder = Int96Encoder;
   using Decoder = Int96Decoder;

   struct Accumulator {};
   struct DictAccumulator {};
 };

 // Traits for FloatType: values accumulate into a NumericBuilder, and
 // dictionary-encoded values into a Dictionary32Builder, both over
 // ::arrow::FloatType.
 template <>
 struct EncodingTraits<FloatType> {
   using Encoder = FloatEncoder;
   using Decoder = FloatDecoder;

   using ArrowType = ::arrow::FloatType;
   using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
 };

 // Traits for DoubleType: values accumulate into a NumericBuilder, and
 // dictionary-encoded values into a Dictionary32Builder, both over
 // ::arrow::DoubleType.
 template <>
 struct EncodingTraits<DoubleType> {
   using Encoder = DoubleEncoder;
   using Decoder = DoubleDecoder;

   using ArrowType = ::arrow::DoubleType;
   using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
 };

 // Traits for ByteArrayType. Unlike the numeric types, no ArrowType alias is
 // defined here and the Accumulator is a small struct holding a type-erased
 // builder; the dictionary accumulator is fixed to ::arrow::BinaryType.
 template <>
 struct EncodingTraits<ByteArrayType> {
   using Encoder = ByteArrayEncoder;
   using Decoder = ByteArrayDecoder;

   /// \brief Internal helper class for decoding BYTE_ARRAY data
   ///
   /// This class allows the caller to choose the concrete Arrow data type
   /// by passing a corresponding `ArrayBuilder`.
   /// Supported `ArrayBuilder` classes are `BinaryBuilder`, `LargeBinaryBuilder`
   /// and `BinaryViewBuilder`.
   /// If the builder is a `BinaryBuilder`, `chunks` can accumulate several
   /// arrays as needed to work around the 32-bit offset limit.
   struct Accumulator {
     // Builder receiving the decoded values; owned by the accumulator.
     std::unique_ptr<::arrow::ArrayBuilder> builder;
     // Arrays accumulated so far (see the note on `BinaryBuilder` above).
     std::vector<std::shared_ptr<::arrow::Array>> chunks;
   };
   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
 };

 // Traits for FLBAType: values accumulate into a FixedSizeBinaryBuilder, and
 // dictionary-encoded values into a Dictionary32Builder over
 // ::arrow::FixedSizeBinaryType. The decoder is the dedicated FLBADecoder class.
 template <>
 struct EncodingTraits<FLBAType> {
   using Encoder = FLBAEncoder;
   using Decoder = FLBADecoder;

   using ArrowType = ::arrow::FixedSizeBinaryType;
   using Accumulator = ::arrow::FixedSizeBinaryBuilder;
   using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
 };

 // Forward declaration; this header only uses pointers to ColumnDescriptor.
 class ColumnDescriptor;

 // Untyped base for all encoders
 class Encoder {
  public:
   virtual ~Encoder() = default;

   // Estimated size of the encoded data.
   // NOTE(review): presumably in bytes, for the values buffered so far -- confirm.
   virtual int64_t EstimatedDataEncodedSize() = 0;
   // Returns the encoded values as a Buffer.
   // NOTE(review): presumably also resets the buffered state -- confirm against
   // the implementations.
   virtual std::shared_ptr<Buffer> FlushValues() = 0;
   // The encoding implemented by this encoder.
   virtual Encoding::type encoding() const = 0;

   // Encode the values of an Arrow array.
   virtual void Put(const ::arrow::Array& values) = 0;

   // Report the number of bytes written to the encoder since the last report.
   // It only works for the BYTE_ARRAY type and throws for other types.
   // This call is not idempotent since it resets the internal counter.
   virtual int64_t ReportUnencodedDataBytes() = 0;

   // The memory pool associated with this encoder.
   virtual MemoryPool* memory_pool() const = 0;
 };

 // Base class for value encoders. Since encoders may or may not have state (e.g.,
 // dictionary encoding) we use a class instance to maintain any state.
 //
 // Encode interfaces are internal, subject to change without deprecation.
 template <typename DType>
 class TypedEncoder : virtual public Encoder {
  public:
   // C++ value type associated with the Parquet type DType.
   using T = typename DType::c_type;

   // Keep Encoder::Put(const ::arrow::Array&) visible next to the typed
   // overloads declared below, which would otherwise hide it.
   using Encoder::Put;

   // Encode `num_values` values read from `src`.
   virtual void Put(const T* src, int num_values) = 0;

   // Encode values held in a vector. With the default `num_values` of -1 the
   // whole vector is encoded.
   virtual void Put(const std::vector<T>& src, int num_values = -1);

   // Variant of Put that also receives a validity bitmap (`valid_bits`, read
   // starting at bit `valid_bits_offset`).
   // NOTE(review): presumably slots marked invalid are skipped -- confirm against
   // the implementations.
   virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
                          int64_t valid_bits_offset) = 0;
 };

 // Vector overload of Put: forwards to the pointer overload. A `num_values` of
 // -1 selects the whole vector; any other value is passed through unchanged.
 template <typename DType>
 void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
   const int count = (num_values == -1) ? static_cast<int>(src.size()) : num_values;
   Put(src.data(), count);
 }

 // Explicit specialization for BooleanType with a deliberately empty body.
 // NOTE(wesm): This stub is here only to satisfy the compiler; it is
 // overridden later with the actual implementation
 template <>
 inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& /*src*/,
                                            int /*num_values*/) {}

 // Base class for dictionary encoders
 template <typename DType>
 class DictEncoder : virtual public TypedEncoder<DType> {
  public:
   /// Writes out any buffered indices to buffer preceded by the bit width of this data.
   /// Returns the number of bytes written.
   /// If the supplied buffer is not big enough, returns -1.
   /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
   /// to size buffer.
   virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;

   /// Size in bytes of the encoded dictionary, i.e. the buffer size that
   /// WriteDict() expects.
   virtual int dict_encoded_size() const = 0;

   /// Bit width of the index data.
   /// NOTE(review): presumably the same width WriteIndices() emits -- confirm.
   virtual int bit_width() const = 0;

   /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
   /// dict_encoded_size() bytes.
   virtual void WriteDict(uint8_t* buffer) const = 0;

   /// NOTE(review): presumably the number of entries currently in the
   /// dictionary -- confirm against the implementations.
   virtual int num_entries() const = 0;

   /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
   /// assumed (without any bounds checking) that the indices reference
   /// preexisting dictionary values
   /// \param[in] indices the dictionary index values. Only Int32Array currently
   /// supported
   virtual void PutIndices(const ::arrow::Array& indices) = 0;

   /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
   /// separately. Currently throws exception if the current dictionary memo is
   /// non-empty
   /// \param[in] values the dictionary values. Only valid for certain
   /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
   virtual void PutDictionary(const ::arrow::Array& values) = 0;
 };

 // ----------------------------------------------------------------------
 // Value decoding

 // Untyped base class of the decoder hierarchy; TypedDecoder derives from it.
 class Decoder {
  public:
   virtual ~Decoder() = default;

   // Sets the data for a new page. This will be called multiple times on the same
   // decoder and should reset all internal state.
   //
   // `num_values` comes from the data page header, and may be greater than the number of
   // physical values in the data buffer if there are some omitted (null) values.
   // `len`, on the other hand, is the size in bytes of the data buffer and
   // directly relates to the number of physical values.
   virtual void SetData(int num_values, const uint8_t* data, int len) = 0;

   // Returns the number of values left (for the last call to SetData()). This is
   // the number of values left in this page.
   virtual int values_left() const = 0;
   // The encoding implemented by this decoder.
   virtual Encoding::type encoding() const = 0;
 };

 // Typed decoder interface for the Parquet type DType. It offers decoding into
 // a raw buffer of T, into a buffer with gaps left for nulls, and into Arrow
 // accumulators (plain or dictionary) selected through EncodingTraits<DType>.
 template <typename DType>
 class TypedDecoder : virtual public Decoder {
  public:
   // C++ value type associated with the Parquet type DType.
   using T = typename DType::c_type;

   /// \brief Decode values into a buffer
   ///
   /// Subclasses may override the more specialized Decode methods below.
   ///
   /// \param[in] buffer destination for decoded values
   /// \param[in] max_values maximum number of values to decode
   /// \return The number of values decoded. Should be identical to max_values except
   /// at the end of the current data page.
   virtual int Decode(T* buffer, int max_values) = 0;

   /// \brief Decode the values in this data page but leave spaces for null entries.
   ///
   /// \param[in] buffer destination for decoded values
   /// \param[in] num_values size of the def_levels and buffer arrays including the number
   /// of null slots
   /// \param[in] null_count number of null slots
   /// \param[in] valid_bits bitmap data indicating position of valid slots
   /// \param[in] valid_bits_offset offset into valid_bits
   /// \return The number of values decoded, including nulls.
   virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
                            const uint8_t* valid_bits, int64_t valid_bits_offset) = 0;

   /// \brief Decode into an ArrayBuilder or other accumulator
   ///
   /// This function assumes the definition levels were already decoded
   /// as a validity bitmap in the given `valid_bits`.  `null_count`
   /// is the number of 0s in `valid_bits`.
   /// As a space optimization, it is allowed for `valid_bits` to be null
   /// if `null_count` is zero.
   ///
   /// \return number of values decoded
   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,
                           typename EncodingTraits<DType>::Accumulator* out) = 0;

   /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
   ///
   /// Forwards to DecodeArrow() with a null count of zero and no validity bitmap.
   ///
   /// \return number of values decoded
   int DecodeArrowNonNull(int num_values,
                          typename EncodingTraits<DType>::Accumulator* out) {
     return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
   }

   /// \brief Decode into a DictionaryBuilder
   ///
   /// This function assumes the definition levels were already decoded
   /// as a validity bitmap in the given `valid_bits`.  `null_count`
   /// is the number of 0s in `valid_bits`.
   /// As a space optimization, it is allowed for `valid_bits` to be null
   /// if `null_count` is zero.
   ///
   /// \return number of values decoded
   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,
                           typename EncodingTraits<DType>::DictAccumulator* builder) = 0;

   /// \brief Decode into a DictionaryBuilder ignoring nulls
   ///
   /// Forwards to DecodeArrow() with a null count of zero and no validity bitmap.
   ///
   /// \return number of values decoded
   int DecodeArrowNonNull(int num_values,
                          typename EncodingTraits<DType>::DictAccumulator* builder) {
     return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
   }
 };

 // Decoder interface for dictionary-encoded data. On top of TypedDecoder it
 // adds setting the dictionary, index-only decoding and access to the
 // dictionary values.
 template <typename DType>
 class DictDecoder : virtual public TypedDecoder<DType> {
  public:
   // C++ value type associated with the Parquet type DType.
   using T = typename DType::c_type;

   /// \brief Set the dictionary from another decoder.
   /// NOTE(review): presumably `dictionary` is positioned on the dictionary page
   /// and its values are read out here -- confirm against the implementations.
   virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;

   /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
   /// but do not append any indices
   virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;

   /// \brief Decode only dictionary indices and append to dictionary
   /// builder. The builder must have had the dictionary from this decoder
   /// inserted already.
   ///
   /// \warning Remember to reset the builder each time the dict decoder is initialized
   /// with a new dictionary page
   virtual int DecodeIndicesSpaced(int num_values, int null_count,
                                   const uint8_t* valid_bits, int64_t valid_bits_offset,
                                   ::arrow::ArrayBuilder* builder) = 0;

   /// \brief Decode only dictionary indices (no nulls)
   ///
   /// \warning Remember to reset the builder each time the dict decoder is initialized
   /// with a new dictionary page
   virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;

   /// \brief Decode only dictionary indices (no nulls). Same as above
   /// DecodeIndices but target is an array instead of a builder.
   ///
   /// \note API EXPERIMENTAL
   virtual int DecodeIndices(int num_values, int32_t* indices) = 0;

   /// \brief Get dictionary. The reader will call this API when it encounters a
   /// new dictionary.
   ///
   /// \param[out] dictionary The pointer to dictionary values. Dictionary is owned by
   /// the decoder and is destroyed when the decoder is destroyed.
   /// \param[out] dictionary_length The dictionary length.
   ///
   /// \note API EXPERIMENTAL
   virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
 };

 // ----------------------------------------------------------------------
 // TypedEncoder specializations, traits, and factory functions

 // Decoder for BooleanType. Adds a Decode overload that writes bit-packed
 // output in addition to the inherited Decode(bool*, int).
 class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
  public:
   // Keep the inherited Decode overload visible; the overload declared below
   // would otherwise hide it.
   using TypedDecoder<BooleanType>::Decode;

   /// \brief Decode and bit-pack values into a buffer
   ///
   /// \param[in] buffer destination for decoded values
   /// This buffer will contain bit-packed values. If
   /// max_values is not a multiple of 8, the trailing bits
   /// of the last byte will be undefined.
   /// \param[in] max_values max values to decode.
   /// \return The number of values decoded. Should be identical to max_values except
   /// at the end of the current data page.
   virtual int Decode(uint8_t* buffer, int max_values) = 0;
 };

 // Decoder for FLBAType. It currently declares no methods of its own and only
 // re-exposes the inherited DecodeSpaced.
 class FLBADecoder : virtual public TypedDecoder<FLBAType> {
  public:
   using TypedDecoder<FLBAType>::DecodeSpaced;

   // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
   // there is value in adding specialized read methods for
   // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
   // then perhaps not
 };

 // Create an untyped encoder for the given physical type and encoding.
 // `descr` defaults to null and `pool` to the default Arrow memory pool.
 // NOTE(review): the exact effect of `use_dictionary` and `descr` is decided by
 // the implementation, which is not part of this header.
 PARQUET_EXPORT
 std::unique_ptr<Encoder> MakeEncoder(
     Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
     const ColumnDescriptor* descr = NULLPTR,
     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

 /// \brief Create an encoder for DType and return it as its typed interface
 ///
 /// Calls MakeEncoder() with DType::type_num and downcasts the result to
 /// EncodingTraits<DType>::Encoder.
 ///
 /// \return the typed encoder, or a null pointer if the created encoder does
 /// not implement the typed interface. In that case the encoder is destroyed
 /// rather than leaked.
 template <typename DType>
 std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
     Encoding::type encoding, bool use_dictionary = false,
     const ColumnDescriptor* descr = NULLPTR,
     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
   using OutType = typename EncodingTraits<DType>::Encoder;
   std::unique_ptr<Encoder> base =
       MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
   std::unique_ptr<OutType> result;
   // Cast before releasing ownership: if the cast fails, `base` still owns the
   // object and deletes it on scope exit.
   if (OutType* typed = dynamic_cast<OutType*>(base.get())) {
     static_cast<void>(base.release());
     result.reset(typed);
   }
   return result;
 }

 // Create an untyped decoder for the given physical type and encoding.
 // `descr` defaults to null and `pool` to the default Arrow memory pool.
 PARQUET_EXPORT
 std::unique_ptr<Decoder> MakeDecoder(
     Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

 namespace detail {

 // Untyped factory behind the MakeDictDecoder<DType>() template in the
 // enclosing namespace; it takes the physical type as a runtime value.
 PARQUET_EXPORT
 std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
                                          const ColumnDescriptor* descr,
                                          ::arrow::MemoryPool* pool);

 }  // namespace detail

 /// \brief Create a dictionary decoder for DType
 ///
 /// Calls detail::MakeDictDecoder() with DType::type_num and downcasts the
 /// result to DictDecoder<DType>.
 ///
 /// \return the typed dictionary decoder, or a null pointer if the created
 /// decoder does not implement DictDecoder<DType>. In that case the decoder is
 /// destroyed rather than leaked.
 template <typename DType>
 std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
     const ColumnDescriptor* descr = NULLPTR,
     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
   using OutType = DictDecoder<DType>;
   std::unique_ptr<Decoder> decoder =
       detail::MakeDictDecoder(DType::type_num, descr, pool);
   std::unique_ptr<OutType> result;
   // Cast before releasing ownership: if the cast fails, `decoder` still owns
   // the object and deletes it on scope exit.
   if (OutType* typed = dynamic_cast<OutType*>(decoder.get())) {
     static_cast<void>(decoder.release());
     result.reset(typed);
   }
   return result;
 }

 /// \brief Create a decoder for DType and return it as its typed interface
 ///
 /// Calls MakeDecoder() with DType::type_num and downcasts the result to
 /// EncodingTraits<DType>::Decoder.
 ///
 /// \return the typed decoder, or a null pointer if the created decoder does
 /// not implement the typed interface. In that case the decoder is destroyed
 /// rather than leaked.
 template <typename DType>
 std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
     Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
   using OutType = typename EncodingTraits<DType>::Decoder;
   std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr, pool);
   std::unique_ptr<OutType> result;
   // Cast before releasing ownership: if the cast fails, `base` still owns the
   // object and deletes it on scope exit.
   if (OutType* typed = dynamic_cast<OutType*>(base.get())) {
     static_cast<void>(base.release());
     result.reset(typed);
   }
   return result;
 }

 }  // namespace parquet
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include <cstdint>
	#include <cstring>
	#include <memory>
	#include <vector>

	#include "arrow/type_fwd.h"

	#include "parquet/exception.h"
	#include "parquet/platform.h"
	#include "parquet/types.h"

	// Forward declaration of the Arrow dictionary builder template, so that the
	// EncodingTraits<...>::DictAccumulator aliases in this header can name it.
	namespace arrow {
	template <typename T>
	class Dictionary32Builder;
	}

	namespace parquet {

	// Forward declaration of the typed encoder interface (defined later in this header).
	template <typename DType>
	class TypedEncoder;

	// Convenience aliases: one typed encoder per Parquet physical type.
	using BooleanEncoder = TypedEncoder<BooleanType>;
	using Int32Encoder = TypedEncoder<Int32Type>;
	using Int64Encoder = TypedEncoder<Int64Type>;
	using Int96Encoder = TypedEncoder<Int96Type>;
	using FloatEncoder = TypedEncoder<FloatType>;
	using DoubleEncoder = TypedEncoder<DoubleType>;
	using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
	using FLBAEncoder = TypedEncoder<FLBAType>;

	// Forward declaration of the typed decoder interface (defined later in this header).
	template <typename DType>
	class TypedDecoder;

	// Decoders per Parquet physical type. BooleanDecoder and FLBADecoder are
	// separate classes (only forward-declared here) rather than plain aliases.
	class BooleanDecoder;
	using Int32Decoder = TypedDecoder<Int32Type>;
	using Int64Decoder = TypedDecoder<Int64Type>;
	using Int96Decoder = TypedDecoder<Int96Type>;
	using FloatDecoder = TypedDecoder<FloatType>;
	using DoubleDecoder = TypedDecoder<DoubleType>;
	using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
	class FLBADecoder;

	// Traits mapping a Parquet type to its encoder, decoder and Arrow accumulator
	// types. The primary template is only declared; each supported type provides
	// an explicit specialization.
	template <typename T>
	struct EncodingTraits;

	// Traits for BooleanType: encoder/decoder classes plus the Arrow type and
	// builder used when decoding into Arrow. DictAccumulator is an empty
	// placeholder struct for this type.
	template <>
	struct EncodingTraits<BooleanType> {
	using Encoder = BooleanEncoder;
	using Decoder = BooleanDecoder;

	using ArrowType = ::arrow::BooleanType;
	using Accumulator = ::arrow::BooleanBuilder;
	struct DictAccumulator {};
	};

	// Traits for Int32Type: values accumulate into a NumericBuilder, and
	// dictionary-encoded values into a Dictionary32Builder, both over
	// ::arrow::Int32Type.
	template <>
	struct EncodingTraits<Int32Type> {
	using Encoder = Int32Encoder;
	using Decoder = Int32Decoder;

	using ArrowType = ::arrow::Int32Type;
	using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
	using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
	};

	// Traits for Int64Type: values accumulate into a NumericBuilder, and
	// dictionary-encoded values into a Dictionary32Builder, both over
	// ::arrow::Int64Type.
	template <>
	struct EncodingTraits<Int64Type> {
	using Encoder = Int64Encoder;
	using Decoder = Int64Decoder;

	using ArrowType = ::arrow::Int64Type;
	using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
	using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
	};

	// Traits for Int96Type. No ArrowType alias is defined for this type, and both
	// Accumulator and DictAccumulator are empty placeholder structs.
	template <>
	struct EncodingTraits<Int96Type> {
	using Encoder = Int96Encoder;
	using Decoder = Int96Decoder;

	struct Accumulator {};
	struct DictAccumulator {};
	};

	// Traits for FloatType: values accumulate into a NumericBuilder, and
	// dictionary-encoded values into a Dictionary32Builder, both over
	// ::arrow::FloatType.
	template <>
	struct EncodingTraits<FloatType> {
	using Encoder = FloatEncoder;
	using Decoder = FloatDecoder;

	using ArrowType = ::arrow::FloatType;
	using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
	using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
	};

	// Traits for DoubleType: values accumulate into a NumericBuilder, and
	// dictionary-encoded values into a Dictionary32Builder, both over
	// ::arrow::DoubleType.
	template <>
	struct EncodingTraits<DoubleType> {
	using Encoder = DoubleEncoder;
	using Decoder = DoubleDecoder;

	using ArrowType = ::arrow::DoubleType;
	using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
	using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
	};

	// Traits for ByteArrayType. Unlike the numeric types, no ArrowType alias is
	// defined here and the Accumulator is a small struct holding a type-erased
	// builder; the dictionary accumulator is fixed to ::arrow::BinaryType.
	template <>
	struct EncodingTraits<ByteArrayType> {
	using Encoder = ByteArrayEncoder;
	using Decoder = ByteArrayDecoder;

	/// \brief Internal helper class for decoding BYTE_ARRAY data
	///
	/// This class allows the caller to choose the concrete Arrow data type
	/// by passing a corresponding `ArrayBuilder`.
	/// Supported `ArrayBuilder` classes are `BinaryBuilder`, `LargeBinaryBuilder`
	/// and `BinaryViewBuilder`.
	/// If the builder is a `BinaryBuilder`, `chunks` can accumulate several
	/// arrays as needed to work around the 32-bit offset limit.
	struct Accumulator {
	// Builder receiving the decoded values; owned by the accumulator.
	std::unique_ptr<::arrow::ArrayBuilder> builder;
	// Arrays accumulated so far (see the note on `BinaryBuilder` above).
	std::vector<std::shared_ptr<::arrow::Array>> chunks;
	};
	using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
	};

	// Traits for FLBAType: values accumulate into a FixedSizeBinaryBuilder, and
	// dictionary-encoded values into a Dictionary32Builder over
	// ::arrow::FixedSizeBinaryType. The decoder is the dedicated FLBADecoder class.
	template <>
	struct EncodingTraits<FLBAType> {
	using Encoder = FLBAEncoder;
	using Decoder = FLBADecoder;

	using ArrowType = ::arrow::FixedSizeBinaryType;
	using Accumulator = ::arrow::FixedSizeBinaryBuilder;
	using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
	};

	// Forward declaration; this header only uses pointers to ColumnDescriptor.
	class ColumnDescriptor;

	// Untyped base for all encoders
	class Encoder {
	public:
	virtual ~Encoder() = default;

	// Estimated size of the encoded data.
	// NOTE(review): presumably in bytes, for the values buffered so far -- confirm.
	virtual int64_t EstimatedDataEncodedSize() = 0;
	// Returns the encoded values as a Buffer.
	// NOTE(review): presumably also resets the buffered state -- confirm against
	// the implementations.
	virtual std::shared_ptr<Buffer> FlushValues() = 0;
	// The encoding implemented by this encoder.
	virtual Encoding::type encoding() const = 0;

	// Encode the values of an Arrow array.
	virtual void Put(const ::arrow::Array& values) = 0;

	// Report the number of bytes written to the encoder since the last report.
	// It only works for the BYTE_ARRAY type and throws for other types.
	// This call is not idempotent since it resets the internal counter.
	virtual int64_t ReportUnencodedDataBytes() = 0;

	// The memory pool associated with this encoder.
	virtual MemoryPool* memory_pool() const = 0;
	};

	// Base class for value encoders. Since encoders may or may not have state (e.g.,
	// dictionary encoding) we use a class instance to maintain any state.
	//
	// Encode interfaces are internal, subject to change without deprecation.
	template <typename DType>
	class TypedEncoder : virtual public Encoder {
	public:
	// C++ value type associated with the Parquet type DType.
	using T = typename DType::c_type;

	// Keep Encoder::Put(const ::arrow::Array&) visible next to the typed
	// overloads declared below, which would otherwise hide it.
	using Encoder::Put;

	// Encode `num_values` values read from `src`.
	virtual void Put(const T* src, int num_values) = 0;

	// Encode values held in a vector. With the default `num_values` of -1 the
	// whole vector is encoded.
	virtual void Put(const std::vector<T>& src, int num_values = -1);

	// Variant of Put that also receives a validity bitmap (`valid_bits`, read
	// starting at bit `valid_bits_offset`).
	// NOTE(review): presumably slots marked invalid are skipped -- confirm against
	// the implementations.
	virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
	int64_t valid_bits_offset) = 0;
	};

	// Vector overload of Put: forwards to the pointer overload. A `num_values` of
	// -1 selects the whole vector; any other value is passed through unchanged.
	template <typename DType>
	void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
	  const int count = (num_values == -1) ? static_cast<int>(src.size()) : num_values;
	  Put(src.data(), count);
	}

	// Explicit specialization for BooleanType with a deliberately empty body.
	// NOTE(wesm): This stub is here only to satisfy the compiler; it is
	// overridden later with the actual implementation
	template <>
	inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& /*src*/,
	                                           int /*num_values*/) {}

	// Base class for dictionary encoders
	template <typename DType>
	class DictEncoder : virtual public TypedEncoder<DType> {
	public:
	/// Writes out any buffered indices to buffer preceded by the bit width of this data.
	/// Returns the number of bytes written.
	/// If the supplied buffer is not big enough, returns -1.
	/// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
	/// to size buffer.
	virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;

	/// Size in bytes of the encoded dictionary, i.e. the buffer size that
	/// WriteDict() expects.
	virtual int dict_encoded_size() const = 0;

	/// Bit width of the index data.
	/// NOTE(review): presumably the same width WriteIndices() emits -- confirm.
	virtual int bit_width() const = 0;

	/// Writes out the encoded dictionary to buffer. buffer must be preallocated to
	/// dict_encoded_size() bytes.
	virtual void WriteDict(uint8_t* buffer) const = 0;

	/// NOTE(review): presumably the number of entries currently in the
	/// dictionary -- confirm against the implementations.
	virtual int num_entries() const = 0;

	/// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
	/// assumed (without any bounds checking) that the indices reference
	/// preexisting dictionary values
	/// \param[in] indices the dictionary index values. Only Int32Array currently
	/// supported
	virtual void PutIndices(const ::arrow::Array& indices) = 0;

	/// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
	/// separately. Currently throws exception if the current dictionary memo is
	/// non-empty
	/// \param[in] values the dictionary values. Only valid for certain
	/// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
	virtual void PutDictionary(const ::arrow::Array& values) = 0;
	};

	// ----------------------------------------------------------------------
	// Value decoding

	// Untyped base class of the decoder hierarchy; TypedDecoder derives from it.
	class Decoder {
	public:
	virtual ~Decoder() = default;

	// Sets the data for a new page. This will be called multiple times on the same
	// decoder and should reset all internal state.
	//
	// `num_values` comes from the data page header, and may be greater than the number of
	// physical values in the data buffer if there are some omitted (null) values.
	// `len`, on the other hand, is the size in bytes of the data buffer and
	// directly relates to the number of physical values.
	virtual void SetData(int num_values, const uint8_t* data, int len) = 0;

	// Returns the number of values left (for the last call to SetData()). This is
	// the number of values left in this page.
	virtual int values_left() const = 0;
	// The encoding implemented by this decoder.
	virtual Encoding::type encoding() const = 0;
	};

	// Typed decoder interface for the Parquet type DType. It offers decoding into
	// a raw buffer of T, into a buffer with gaps left for nulls, and into Arrow
	// accumulators (plain or dictionary) selected through EncodingTraits<DType>.
	template <typename DType>
	class TypedDecoder : virtual public Decoder {
	 public:
	  // C++ value type associated with the Parquet type DType.
	  using T = typename DType::c_type;

	  /// \brief Decode values into a buffer
	  ///
	  /// Subclasses may override the more specialized Decode methods below.
	  ///
	  /// \param[in] buffer destination for decoded values
	  /// \param[in] max_values maximum number of values to decode
	  /// \return The number of values decoded. Should be identical to max_values except
	  /// at the end of the current data page.
	  virtual int Decode(T* buffer, int max_values) = 0;

	  /// \brief Decode the values in this data page but leave spaces for null entries.
	  ///
	  /// \param[in] buffer destination for decoded values
	  /// \param[in] num_values size of the def_levels and buffer arrays including the number
	  /// of null slots
	  /// \param[in] null_count number of null slots
	  /// \param[in] valid_bits bitmap data indicating position of valid slots
	  /// \param[in] valid_bits_offset offset into valid_bits
	  /// \return The number of values decoded, including nulls.
	  virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
	                           const uint8_t* valid_bits, int64_t valid_bits_offset) = 0;

	  /// \brief Decode into an ArrayBuilder or other accumulator
	  ///
	  /// This function assumes the definition levels were already decoded
	  /// as a validity bitmap in the given `valid_bits`. `null_count`
	  /// is the number of 0s in `valid_bits`.
	  /// As a space optimization, it is allowed for `valid_bits` to be null
	  /// if `null_count` is zero.
	  ///
	  /// \return number of values decoded
	  virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
	                          int64_t valid_bits_offset,
	                          typename EncodingTraits<DType>::Accumulator* out) = 0;

	  /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
	  ///
	  /// Forwards to DecodeArrow() with a null count of zero and no validity bitmap.
	  ///
	  /// \return number of values decoded
	  int DecodeArrowNonNull(int num_values,
	                         typename EncodingTraits<DType>::Accumulator* out) {
	    return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
	  }

	  /// \brief Decode into a DictionaryBuilder
	  ///
	  /// This function assumes the definition levels were already decoded
	  /// as a validity bitmap in the given `valid_bits`. `null_count`
	  /// is the number of 0s in `valid_bits`.
	  /// As a space optimization, it is allowed for `valid_bits` to be null
	  /// if `null_count` is zero.
	  ///
	  /// \return number of values decoded
	  virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
	                          int64_t valid_bits_offset,
	                          typename EncodingTraits<DType>::DictAccumulator* builder) = 0;

	  /// \brief Decode into a DictionaryBuilder ignoring nulls
	  ///
	  /// Forwards to DecodeArrow() with a null count of zero and no validity bitmap.
	  ///
	  /// \return number of values decoded
	  int DecodeArrowNonNull(int num_values,
	                         typename EncodingTraits<DType>::DictAccumulator* builder) {
	    return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
	  }
	};

	// Decoder interface for dictionary-encoded data. On top of TypedDecoder it
	// adds setting the dictionary, index-only decoding and access to the
	// dictionary values.
	template <typename DType>
	class DictDecoder : virtual public TypedDecoder<DType> {
	public:
	// C++ value type associated with the Parquet type DType.
	using T = typename DType::c_type;

	/// \brief Set the dictionary from another decoder.
	/// NOTE(review): presumably `dictionary` is positioned on the dictionary page
	/// and its values are read out here -- confirm against the implementations.
	virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;

	/// \brief Insert dictionary values into the Arrow dictionary builder's memo,
	/// but do not append any indices
	virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;

	/// \brief Decode only dictionary indices and append to dictionary
	/// builder. The builder must have had the dictionary from this decoder
	/// inserted already.
	///
	/// \warning Remember to reset the builder each time the dict decoder is initialized
	/// with a new dictionary page
	virtual int DecodeIndicesSpaced(int num_values, int null_count,
	const uint8_t* valid_bits, int64_t valid_bits_offset,
	::arrow::ArrayBuilder* builder) = 0;

	/// \brief Decode only dictionary indices (no nulls)
	///
	/// \warning Remember to reset the builder each time the dict decoder is initialized
	/// with a new dictionary page
	virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;

	/// \brief Decode only dictionary indices (no nulls). Same as above
	/// DecodeIndices but target is an array instead of a builder.
	///
	/// \note API EXPERIMENTAL
	virtual int DecodeIndices(int num_values, int32_t* indices) = 0;

	/// \brief Get dictionary. The reader will call this API when it encounters a
	/// new dictionary.
	///
	/// \param[out] dictionary The pointer to dictionary values. Dictionary is owned by
	/// the decoder and is destroyed when the decoder is destroyed.
	/// \param[out] dictionary_length The dictionary length.
	///
	/// \note API EXPERIMENTAL
	virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
	};

	// ----------------------------------------------------------------------
	// TypedEncoder specializations, traits, and factory functions

	// Decoder for BooleanType. Adds a Decode overload that writes bit-packed
	// output in addition to the inherited Decode(bool*, int).
	class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
	public:
	// Keep the inherited Decode overload visible; the overload declared below
	// would otherwise hide it.
	using TypedDecoder<BooleanType>::Decode;

	/// \brief Decode and bit-pack values into a buffer
	///
	/// \param[in] buffer destination for decoded values
	/// This buffer will contain bit-packed values. If
	/// max_values is not a multiple of 8, the trailing bits
	/// of the last byte will be undefined.
	/// \param[in] max_values max values to decode.
	/// \return The number of values decoded. Should be identical to max_values except
	/// at the end of the current data page.
	virtual int Decode(uint8_t* buffer, int max_values) = 0;
	};

	// Decoder for FLBAType. It currently declares no methods of its own and only
	// re-exposes the inherited DecodeSpaced.
	class FLBADecoder : virtual public TypedDecoder<FLBAType> {
	public:
	using TypedDecoder<FLBAType>::DecodeSpaced;

	// TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
	// there is value in adding specialized read methods for
	// FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
	// then perhaps not
	};

	// Create an untyped encoder for the given physical type and encoding.
	// `descr` defaults to null and `pool` to the default Arrow memory pool.
	// NOTE(review): the exact effect of `use_dictionary` and `descr` is decided by
	// the implementation, which is not part of this header.
	PARQUET_EXPORT
	std::unique_ptr<Encoder> MakeEncoder(
	Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
	const ColumnDescriptor* descr = NULLPTR,
	::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

	/// \brief Create an encoder for DType and return it as its typed interface
	///
	/// Calls MakeEncoder() with DType::type_num and downcasts the result to
	/// EncodingTraits<DType>::Encoder.
	///
	/// \return the typed encoder, or a null pointer if the created encoder does
	/// not implement the typed interface. In that case the encoder is destroyed
	/// rather than leaked.
	template <typename DType>
	std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
	    Encoding::type encoding, bool use_dictionary = false,
	    const ColumnDescriptor* descr = NULLPTR,
	    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
	  using OutType = typename EncodingTraits<DType>::Encoder;
	  std::unique_ptr<Encoder> base =
	      MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
	  std::unique_ptr<OutType> result;
	  // Cast before releasing ownership: if the cast fails, `base` still owns the
	  // object and deletes it on scope exit.
	  if (OutType* typed = dynamic_cast<OutType*>(base.get())) {
	    static_cast<void>(base.release());
	    result.reset(typed);
	  }
	  return result;
	}

	// Create an untyped decoder for the given physical type and encoding.
	// `descr` defaults to null and `pool` to the default Arrow memory pool.
	PARQUET_EXPORT
	std::unique_ptr<Decoder> MakeDecoder(
	Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
	::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

	namespace detail {

	// Untyped factory behind the MakeDictDecoder<DType>() template in the
	// enclosing namespace; it takes the physical type as a runtime value.
	PARQUET_EXPORT
	std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
	const ColumnDescriptor* descr,
	::arrow::MemoryPool* pool);

	} // namespace detail

	/// \brief Create a dictionary decoder for DType
	///
	/// Calls detail::MakeDictDecoder() with DType::type_num and downcasts the
	/// result to DictDecoder<DType>.
	///
	/// \return the typed dictionary decoder, or a null pointer if the created
	/// decoder does not implement DictDecoder<DType>. In that case the decoder is
	/// destroyed rather than leaked.
	template <typename DType>
	std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
	    const ColumnDescriptor* descr = NULLPTR,
	    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
	  using OutType = DictDecoder<DType>;
	  std::unique_ptr<Decoder> decoder =
	      detail::MakeDictDecoder(DType::type_num, descr, pool);
	  std::unique_ptr<OutType> result;
	  // Cast before releasing ownership: if the cast fails, `decoder` still owns
	  // the object and deletes it on scope exit.
	  if (OutType* typed = dynamic_cast<OutType*>(decoder.get())) {
	    static_cast<void>(decoder.release());
	    result.reset(typed);
	  }
	  return result;
	}

	/// \brief Create a decoder for DType and return it as its typed interface
	///
	/// Calls MakeDecoder() with DType::type_num and downcasts the result to
	/// EncodingTraits<DType>::Decoder.
	///
	/// \return the typed decoder, or a null pointer if the created decoder does
	/// not implement the typed interface. In that case the decoder is destroyed
	/// rather than leaked.
	template <typename DType>
	std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
	    Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
	    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
	  using OutType = typename EncodingTraits<DType>::Decoder;
	  std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr, pool);
	  std::unique_ptr<OutType> result;
	  // Cast before releasing ownership: if the cast fails, `base` still owns the
	  // object and deletes it on scope exit.
	  if (OutType* typed = dynamic_cast<OutType*>(base.get())) {
	    static_cast<void>(base.release());
	    result.reset(typed);
	  }
	  return result;
	}

	} // namespace parquet