cpp/src/parquet/properties.h - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #ifndef PARQUET_COLUMN_PROPERTIES_H
 #define PARQUET_COLUMN_PROPERTIES_H

 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>

 #include "arrow/type.h"
 #include "arrow/util/compression.h"

 #include "parquet/exception.h"
 #include "parquet/parquet_version.h"
 #include "parquet/platform.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"

 namespace parquet {

 /// \brief Scope holder for the Parquet format version enumeration, so that
 /// the enumerators are spelled ParquetVersion::PARQUET_x_y.
 struct ParquetVersion {
   /// Format versions a writer can be configured with.
   enum type {
     PARQUET_1_0 = 0,
     PARQUET_2_0 = 1
   };
 };

 // Initial values that the ReaderProperties constructor assigns to its buffer
 // size and buffered-stream flag.  They are constexpr (like the writer defaults
 // further down) so that every translation unit including this header sees the
 // same immutable constant instead of owning a private, writable copy.
 static constexpr int64_t DEFAULT_BUFFER_SIZE = 1024;
 static constexpr bool DEFAULT_USE_BUFFERED_STREAM = false;

 class PARQUET_EXPORT ReaderProperties {
  public:
   explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
       : pool_(pool) {
     buffered_stream_enabled_ = DEFAULT_USE_BUFFERED_STREAM;
     buffer_size_ = DEFAULT_BUFFER_SIZE;
   }

   MemoryPool* memory_pool() const { return pool_; }

   std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
                                               int64_t start, int64_t num_bytes);

   bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }

   void enable_buffered_stream() { buffered_stream_enabled_ = true; }

   void disable_buffered_stream() { buffered_stream_enabled_ = false; }

   void set_buffer_size(int64_t buf_size) { buffer_size_ = buf_size; }

   int64_t buffer_size() const { return buffer_size_; }

  private:
   MemoryPool* pool_;
   int64_t buffer_size_;
   bool buffered_stream_enabled_;
 };

 /// \brief Returns a ReaderProperties value by copy; the definition lives
 /// outside this header.
 /// NOTE(review): presumably a default-constructed instance — confirm in the
 /// implementation file.
 ReaderProperties PARQUET_EXPORT default_reader_properties();

 // Defaults consumed by ColumnProperties and WriterProperties::Builder below.
 //
 // Initial data page size of WriterProperties::Builder.
 // NOTE(review): presumably a byte count (1 MiB) — confirm against the writer.
 static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
 // Default for ColumnProperties' dictionary_enabled argument.
 static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
 // Initial dictionary page size limit of the builder; same value as the data
 // page size default.
 static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
 // Initial write batch size of the builder.
 static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
 // Initial maximum row group length of the builder.
 // NOTE(review): presumably counted in rows — confirm against the writer.
 static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
 // Default for ColumnProperties' statistics_enabled argument.
 static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
 // Default for ColumnProperties' max_stats_size argument.
 static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
 // Default for ColumnProperties' encoding argument.
 static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
 // Format version the builder starts with.
 static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION =
     ParquetVersion::PARQUET_1_0;
 // Initial "created by" string of the builder, taken from CREATED_BY_VERSION.
 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
 // Default for ColumnProperties' codec argument.
 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;

 /// \brief Writer settings that apply to a single column: encoding,
 /// compression codec and level, dictionary and statistics switches, and the
 /// maximum statistics size.
 class PARQUET_EXPORT ColumnProperties {
  public:
   /// \brief Build a settings bundle.  Each argument falls back to the
   /// corresponding DEFAULT_* constant.  The compression level is not a
   /// constructor argument; it always starts at
   /// Codec::UseDefaultCompressionLevel().
   ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
                    Compression::type codec = DEFAULT_COMPRESSION_TYPE,
                    bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
                    bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
                    size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
       : encoding_(encoding),
         codec_(codec),
         dictionary_enabled_(dictionary_enabled),
         statistics_enabled_(statistics_enabled),
         max_stats_size_(max_stats_size),
         compression_level_(Codec::UseDefaultCompressionLevel()) {}

   // --- encoding ---
   void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
   Encoding::type encoding() const { return encoding_; }

   // --- compression codec ---
   void set_compression(Compression::type codec) { codec_ = codec; }
   Compression::type compression() const { return codec_; }

   // --- compression level (stored as given, no validation here) ---
   void set_compression_level(int compression_level) {
     compression_level_ = compression_level;
   }
   int compression_level() const { return compression_level_; }

   // --- dictionary switch ---
   void set_dictionary_enabled(bool dictionary_enabled) {
     dictionary_enabled_ = dictionary_enabled;
   }
   bool dictionary_enabled() const { return dictionary_enabled_; }

   // --- statistics switch ---
   void set_statistics_enabled(bool statistics_enabled) {
     statistics_enabled_ = statistics_enabled;
   }
   bool statistics_enabled() const { return statistics_enabled_; }

   // --- maximum statistics size ---
   void set_max_statistics_size(size_t max_stats_size) {
     max_stats_size_ = max_stats_size;
   }
   size_t max_statistics_size() const { return max_stats_size_; }

  private:
   Encoding::type encoding_;
   Compression::type codec_;
   bool dictionary_enabled_;
   bool statistics_enabled_;
   size_t max_stats_size_;
   int compression_level_;
 };

 /// \brief Settings consulted by a Parquet writer.  Instances are produced by
 /// WriterProperties::Builder (the constructor is private) and expose
 /// read-only accessors.
 class PARQUET_EXPORT WriterProperties {
  public:
   /// \brief Chainable builder.  Every setter stores one option and returns
   /// `this`; build() turns the collected options into a WriterProperties.
   class Builder {
    public:
     /// Start from the default Arrow memory pool and the DEFAULT_* /
     /// kDefaultDataPageSize constants.
     Builder()
         : pool_(::arrow::default_memory_pool()),
           dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
           write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
           max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
           pagesize_(kDefaultDataPageSize),
           version_(DEFAULT_WRITER_VERSION),
           created_by_(DEFAULT_CREATED_BY) {}
     virtual ~Builder() = default;

     /// Use \p pool as the memory pool of the built properties.
     Builder* memory_pool(MemoryPool* pool) {
       pool_ = pool;
       return this;
     }

     /// Enable dictionary encoding in the default column settings.
     Builder* enable_dictionary() {
       default_column_properties_.set_dictionary_enabled(true);
       return this;
     }

     /// Disable dictionary encoding in the default column settings.
     Builder* disable_dictionary() {
       default_column_properties_.set_dictionary_enabled(false);
       return this;
     }

     /// Enable dictionary encoding for the column named by \p path.
     Builder* enable_dictionary(const std::string& path) {
       dictionary_enabled_[path] = true;
       return this;
     }

     /// Same as the string overload, keyed by the dot-string form of \p path.
     Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
       const std::string dotted = path->ToDotString();
       return enable_dictionary(dotted);
     }

     /// Disable dictionary encoding for the column named by \p path.
     Builder* disable_dictionary(const std::string& path) {
       dictionary_enabled_[path] = false;
       return this;
     }

     /// Same as the string overload, keyed by the dot-string form of \p path.
     Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
       const std::string dotted = path->ToDotString();
       return disable_dictionary(dotted);
     }

     /// Set the dictionary page size limit.
     Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
       dictionary_pagesize_limit_ = dictionary_psize_limit;
       return this;
     }

     /// Set the write batch size.
     Builder* write_batch_size(int64_t write_batch_size) {
       write_batch_size_ = write_batch_size;
       return this;
     }

     /// Set the maximum row group length.
     Builder* max_row_group_length(int64_t max_row_group_length) {
       max_row_group_length_ = max_row_group_length;
       return this;
     }

     /// Set the data page size.
     Builder* data_pagesize(int64_t pg_size) {
       pagesize_ = pg_size;
       return this;
     }

     /// Set the Parquet format version.
     Builder* version(ParquetVersion::type version) {
       version_ = version;
       return this;
     }

     /// Set the "created by" string.
     Builder* created_by(const std::string& created_by) {
       created_by_ = created_by;
       return this;
     }

     /**
      * Define the encoding that is used when dictionary encoding is not in use.
      *
      * This applies either when dictionary encoding is disabled or when the
      * writer falls back because the dictionary grew too large.  This overload
      * changes the default column settings.
      *
      * Throws ParquetException if a dictionary encoding is passed.
      */
     Builder* encoding(Encoding::type encoding_type) {
       RejectDictionaryEncoding(encoding_type);
       default_column_properties_.set_encoding(encoding_type);
       return this;
     }

     /**
      * Define the encoding that is used when dictionary encoding is not in use,
      * for the column named by path.
      *
      * This applies either when dictionary encoding is disabled or when the
      * writer falls back because the dictionary grew too large.
      *
      * Throws ParquetException if a dictionary encoding is passed.
      */
     Builder* encoding(const std::string& path, Encoding::type encoding_type) {
       RejectDictionaryEncoding(encoding_type);
       encodings_[path] = encoding_type;
       return this;
     }

     /**
      * Define the encoding that is used when dictionary encoding is not in use,
      * for the column identified by path (converted to its dot-string form).
      *
      * This applies either when dictionary encoding is disabled or when the
      * writer falls back because the dictionary grew too large.
      *
      * Throws ParquetException if a dictionary encoding is passed.
      */
     Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
                       Encoding::type encoding_type) {
       const std::string dotted = path->ToDotString();
       return encoding(dotted, encoding_type);
     }

     /// Set the compression codec in the default column settings.
     Builder* compression(Compression::type codec) {
       default_column_properties_.set_compression(codec);
       return this;
     }

     /// Set the maximum statistics size in the default column settings.
     Builder* max_statistics_size(size_t max_stats_sz) {
       default_column_properties_.set_max_statistics_size(max_stats_sz);
       return this;
     }

     /// Set the compression codec for the column named by \p path.
     Builder* compression(const std::string& path, Compression::type codec) {
       codecs_[path] = codec;
       return this;
     }

     /// Same as the string overload, keyed by the dot-string form of \p path.
     Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
                          Compression::type codec) {
       const std::string dotted = path->ToDotString();
       return compression(dotted, codec);
     }

     /// \brief Specify the default compression level for the compressor in
     /// every column.  A column without an explicitly specified compression
     /// level uses this one.
     ///
     /// The level is compressor specific, so the caller has to know which
     /// levels the selected compressor offers.  If the compressor does not
     /// support selecting a level, calling this function has no effect.
     /// Parquet and Arrow do not validate the passed compression level.  If no
     /// level is selected by the user, or if the special
     /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
     /// compression level.
     Builder* compression_level(int compression_level) {
       default_column_properties_.set_compression_level(compression_level);
       return this;
     }

     /// \brief Specify a compression level for the compressor of the column
     /// described by path.
     ///
     /// The level is compressor specific, so the caller has to know which
     /// levels the selected compressor offers.  If the compressor does not
     /// support selecting a level, calling this function has no effect.
     /// Parquet and Arrow do not validate the passed compression level.  If no
     /// level is selected by the user, or if the special
     /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
     /// compression level.
     Builder* compression_level(const std::string& path, int compression_level) {
       codecs_compression_level_[path] = compression_level;
       return this;
     }

     /// \brief Specify a compression level for the compressor of the column
     /// described by path (converted to its dot-string form).
     ///
     /// The level is compressor specific, so the caller has to know which
     /// levels the selected compressor offers.  If the compressor does not
     /// support selecting a level, calling this function has no effect.
     /// Parquet and Arrow do not validate the passed compression level.  If no
     /// level is selected by the user, or if the special
     /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
     /// compression level.
     Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
                                int compression_level) {
       const std::string dotted = path->ToDotString();
       return this->compression_level(dotted, compression_level);
     }

     /// Enable statistics in the default column settings.
     Builder* enable_statistics() {
       default_column_properties_.set_statistics_enabled(true);
       return this;
     }

     /// Disable statistics in the default column settings.
     Builder* disable_statistics() {
       default_column_properties_.set_statistics_enabled(false);
       return this;
     }

     /// Enable statistics for the column named by \p path.
     Builder* enable_statistics(const std::string& path) {
       statistics_enabled_[path] = true;
       return this;
     }

     /// Same as the string overload, keyed by the dot-string form of \p path.
     Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
       const std::string dotted = path->ToDotString();
       return enable_statistics(dotted);
     }

     /// Disable statistics for the column named by \p path.
     Builder* disable_statistics(const std::string& path) {
       statistics_enabled_[path] = false;
       return this;
     }

     /// Same as the string overload, keyed by the dot-string form of \p path.
     Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
       const std::string dotted = path->ToDotString();
       return disable_statistics(dotted);
     }

     /// \brief Assemble the WriterProperties.
     ///
     /// Every column mentioned in one of the per-column maps receives its own
     /// ColumnProperties, seeded with a copy of the default column settings as
     /// they are at the time of this call and then adjusted by the recorded
     /// overrides.
     std::shared_ptr<WriterProperties> build() {
       std::unordered_map<std::string, ColumnProperties> per_column;
       // Look up the entry for `column`; the first mention of a column inserts
       // a copy of the defaults, later mentions reuse the existing entry.
       auto entry_for = [&](const std::string& column) -> ColumnProperties& {
         return per_column.emplace(column, default_column_properties_).first->second;
       };

       for (const auto& kv : encodings_) {
         entry_for(kv.first).set_encoding(kv.second);
       }
       for (const auto& kv : codecs_) {
         entry_for(kv.first).set_compression(kv.second);
       }
       for (const auto& kv : codecs_compression_level_) {
         entry_for(kv.first).set_compression_level(kv.second);
       }
       for (const auto& kv : dictionary_enabled_) {
         entry_for(kv.first).set_dictionary_enabled(kv.second);
       }
       for (const auto& kv : statistics_enabled_) {
         entry_for(kv.first).set_statistics_enabled(kv.second);
       }

       // The WriterProperties constructor is private, hence the explicit `new`.
       return std::shared_ptr<WriterProperties>(new WriterProperties(
           pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
           pagesize_, version_, created_by_, default_column_properties_, per_column));
     }

    private:
     /// Throw ParquetException when \p encoding_type is one of the dictionary
     /// encodings, which are not accepted as a fallback encoding.
     static void RejectDictionaryEncoding(Encoding::type encoding_type) {
       const bool is_dictionary = encoding_type == Encoding::PLAIN_DICTIONARY ||
                                  encoding_type == Encoding::RLE_DICTIONARY;
       if (is_dictionary) {
         throw ParquetException("Can't use dictionary encoding as fallback encoding");
       }
     }

     MemoryPool* pool_;
     int64_t dictionary_pagesize_limit_;
     int64_t write_batch_size_;
     int64_t max_row_group_length_;
     int64_t pagesize_;
     ParquetVersion::type version_;
     std::string created_by_;

     // Settings used for each column unless overridden in any of the maps below
     ColumnProperties default_column_properties_;
     std::unordered_map<std::string, Encoding::type> encodings_;
     std::unordered_map<std::string, Compression::type> codecs_;
     std::unordered_map<std::string, int32_t> codecs_compression_level_;
     std::unordered_map<std::string, bool> dictionary_enabled_;
     std::unordered_map<std::string, bool> statistics_enabled_;
   };

   /// Memory pool chosen at build time.
   MemoryPool* memory_pool() const { return pool_; }

   /// Dictionary page size limit chosen at build time.
   int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }

   /// Write batch size chosen at build time.
   int64_t write_batch_size() const { return write_batch_size_; }

   /// Maximum row group length chosen at build time.
   int64_t max_row_group_length() const { return max_row_group_length_; }

   /// Data page size chosen at build time.
   int64_t data_pagesize() const { return pagesize_; }

   /// Parquet format version chosen at build time.
   ParquetVersion::type version() const { return parquet_version_; }

   /// Copy of the "created by" string chosen at build time.
   std::string created_by() const { return parquet_created_by_; }

   /// PLAIN_DICTIONARY for format version 1.0, RLE_DICTIONARY otherwise.
   Encoding::type dictionary_index_encoding() const {
     const bool is_v1 = parquet_version_ == ParquetVersion::PARQUET_1_0;
     return is_v1 ? Encoding::PLAIN_DICTIONARY : Encoding::RLE_DICTIONARY;
   }

   /// PLAIN_DICTIONARY for format version 1.0, PLAIN otherwise.
   Encoding::type dictionary_page_encoding() const {
     const bool is_v1 = parquet_version_ == ParquetVersion::PARQUET_1_0;
     return is_v1 ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN;
   }

   /// \brief Settings for the column at \p path: the per-column entry when one
   /// was recorded under the path's dot-string, the default settings otherwise.
   const ColumnProperties& column_properties(
       const std::shared_ptr<schema::ColumnPath>& path) const {
     const auto found = column_properties_.find(path->ToDotString());
     if (found == column_properties_.end()) {
       return default_column_properties_;
     }
     return found->second;
   }

   // The accessors below forward to column_properties(path).

   Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
     return column_properties(path).encoding();
   }

   Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
     return column_properties(path).compression();
   }

   int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
     return column_properties(path).compression_level();
   }

   bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
     return column_properties(path).dictionary_enabled();
   }

   bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
     return column_properties(path).statistics_enabled();
   }

   size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
     return column_properties(path).max_statistics_size();
   }

  private:
   /// Only Builder::build() calls this; it copies every argument into the
   /// matching member.
   explicit WriterProperties(
       MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
       int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
       const std::string& created_by, const ColumnProperties& default_column_properties,
       const std::unordered_map<std::string, ColumnProperties>& column_properties)
       : pool_(pool),
         dictionary_pagesize_limit_(dictionary_pagesize_limit),
         write_batch_size_(write_batch_size),
         max_row_group_length_(max_row_group_length),
         pagesize_(pagesize),
         parquet_version_(version),
         parquet_created_by_(created_by),
         default_column_properties_(default_column_properties),
         column_properties_(column_properties) {}

   MemoryPool* pool_;
   int64_t dictionary_pagesize_limit_;
   int64_t write_batch_size_;
   int64_t max_row_group_length_;
   int64_t pagesize_;
   ParquetVersion::type parquet_version_;
   std::string parquet_created_by_;
   ColumnProperties default_column_properties_;
   std::unordered_map<std::string, ColumnProperties> column_properties_;
 };

 /// \brief Returns a shared WriterProperties instance; the definition lives
 /// outside this header.
 /// NOTE(review): presumably equivalent to WriterProperties::Builder().build()
 /// — confirm in the implementation file.
 std::shared_ptr<WriterProperties> PARQUET_EXPORT default_writer_properties();

 // ----------------------------------------------------------------------
 // Properties specific to Apache Arrow columnar read and write

 // Default for the use_threads argument of the ArrowReaderProperties constructor
 static constexpr bool kArrowDefaultUseThreads = false;

 // Default number of rows to read when using ::arrow::RecordBatchReader
 static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;

 /// EXPERIMENTAL: Properties for configuring FileReader behavior: a threading
 /// flag, per-column read_dictionary flags and a batch size.
 class PARQUET_EXPORT ArrowReaderProperties {
  public:
   /// \brief Start with no column flagged for dictionary reading and a batch
   /// size of kArrowDefaultBatchSize.
   explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
       : use_threads_(use_threads),
         read_dict_indices_(),
         batch_size_(kArrowDefaultBatchSize) {}

   /// Threading flag accessors.
   bool use_threads() const { return use_threads_; }
   void set_use_threads(bool use_threads) { use_threads_ = use_threads; }

   /// \brief Set or clear the read_dictionary flag of \p column_index.
   /// Clearing a flag that was never set is a no-op.
   void set_read_dictionary(int column_index, bool read_dict) {
     if (!read_dict) {
       read_dict_indices_.erase(column_index);
       return;
     }
     read_dict_indices_.insert(column_index);
   }

   /// True when the read_dictionary flag is currently set for \p column_index.
   bool read_dictionary(int column_index) const {
     return read_dict_indices_.count(column_index) != 0;
   }

   /// Batch size accessors; the value is stored without validation.
   int64_t batch_size() const { return batch_size_; }
   void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }

  private:
   bool use_threads_;
   // Indices of the columns whose read_dictionary flag is set.
   std::unordered_set<int> read_dict_indices_;
   int64_t batch_size_;
 };

 /// EXPERIMENTAL: Constructs the default ArrowReaderProperties and returns it
 /// by value. The definition lives outside this header.
 PARQUET_EXPORT
 ArrowReaderProperties default_arrow_reader_properties();

 /// \brief Options for writing Arrow data to Parquet: INT96 timestamp output,
 /// timestamp coercion and truncation, and schema storage.  Instances come
 /// from ArrowWriterProperties::Builder; all stored options are const.
 class PARQUET_EXPORT ArrowWriterProperties {
  public:
   /// \brief Chainable builder.  Every setter returns `this`; build() creates
   /// the ArrowWriterProperties.
   class Builder {
    public:
     /// All switches start off; the coercion unit starts at SECOND.
     Builder()
         : write_timestamps_as_int96_(false),
           coerce_timestamps_enabled_(false),
           coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
           truncated_timestamps_allowed_(false),
           store_schema_(false) {}
     virtual ~Builder() = default;

     /// Turn on writing timestamps as (deprecated) INT96.
     Builder* enable_deprecated_int96_timestamps() {
       write_timestamps_as_int96_ = true;
       return this;
     }

     /// Turn off writing timestamps as (deprecated) INT96.
     Builder* disable_deprecated_int96_timestamps() {
       write_timestamps_as_int96_ = false;
       return this;
     }

     /// Enable timestamp coercion and record \p unit as the target unit.
     Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
       coerce_timestamps_unit_ = unit;
       coerce_timestamps_enabled_ = true;
       return this;
     }

     /// Set the truncated-timestamps-allowed flag.
     Builder* allow_truncated_timestamps() {
       truncated_timestamps_allowed_ = true;
       return this;
     }

     /// Clear the truncated-timestamps-allowed flag.
     Builder* disallow_truncated_timestamps() {
       truncated_timestamps_allowed_ = false;
       return this;
     }

     /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
     /// to enable certain read options (like "read_dictionary") to be set
     /// automatically
     Builder* store_schema() {
       store_schema_ = true;
       return this;
     }

     /// Create the properties object from the options collected so far.
     std::shared_ptr<ArrowWriterProperties> build() {
       // The ArrowWriterProperties constructor is private, hence the explicit
       // `new` rather than std::make_shared.
       std::shared_ptr<ArrowWriterProperties> result(new ArrowWriterProperties(
           write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
           truncated_timestamps_allowed_, store_schema_));
       return result;
     }

    private:
     bool write_timestamps_as_int96_;

     bool coerce_timestamps_enabled_;
     ::arrow::TimeUnit::type coerce_timestamps_unit_;
     bool truncated_timestamps_allowed_;

     bool store_schema_;
   };

   /// True when the builder enabled deprecated INT96 timestamps.
   bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }

   /// True when Builder::coerce_timestamps() was called.
   bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }

   /// The recorded coercion unit (SECOND if coerce_timestamps() was never
   /// called on the builder).
   ::arrow::TimeUnit::type coerce_timestamps_unit() const {
     return coerce_timestamps_unit_;
   }

   /// State of the truncated-timestamps-allowed flag.
   bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }

   /// True when Builder::store_schema() was called.
   bool store_schema() const { return store_schema_; }

  private:
   /// Only Builder::build() calls this; each argument initializes the member
   /// of the same name.
   explicit ArrowWriterProperties(bool write_timestamps_as_int96,
                                  bool coerce_timestamps_enabled,
                                  ::arrow::TimeUnit::type coerce_timestamps_unit,
                                  bool truncated_timestamps_allowed, bool store_schema)
       : write_timestamps_as_int96_(write_timestamps_as_int96),
         coerce_timestamps_enabled_(coerce_timestamps_enabled),
         coerce_timestamps_unit_(coerce_timestamps_unit),
         truncated_timestamps_allowed_(truncated_timestamps_allowed),
         store_schema_(store_schema) {}

   const bool write_timestamps_as_int96_;
   const bool coerce_timestamps_enabled_;
   const ::arrow::TimeUnit::type coerce_timestamps_unit_;
   const bool truncated_timestamps_allowed_;
   const bool store_schema_;
 };

 /// \brief State object used for writing Arrow data directly to a Parquet
 /// column chunk: a memory pool, the writer properties and two scratch
 /// buffers. API possibly not stable
 struct ArrowWriteContext {
   /// \brief Store both pointers and allocate the two buffers from
   /// \p memory_pool via AllocateBuffer().
   ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
       : memory_pool(memory_pool),
         properties(properties),
         data_buffer(AllocateBuffer(memory_pool)),
         def_levels_buffer(AllocateBuffer(memory_pool)) {}

   /// \brief Resize data_buffer to num_values * sizeof(T) bytes and hand back
   /// its storage as a T*.
   ///
   /// On a failed resize the error status is returned and \p out is left
   /// untouched.  The pointer written to \p out refers to data_buffer's memory.
   template <typename T>
   ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
     const auto required_bytes = num_values * sizeof(T);
     ARROW_RETURN_NOT_OK(data_buffer->Resize(required_bytes, false));
     T* scratch = reinterpret_cast<T*>(data_buffer->mutable_data());
     *out = scratch;
     return ::arrow::Status::OK();
   }

   MemoryPool* memory_pool;
   const ArrowWriterProperties* properties;

   // Scratch space holding an array's data after conversion to the physical
   // type that parquet-cpp expects.
   std::shared_ptr<ResizableBuffer> data_buffer;

   // This buffer is held through shared ownership.
   std::shared_ptr<ResizableBuffer> def_levels_buffer;
 };

 /// \brief Returns a shared ArrowWriterProperties instance; the definition
 /// lives outside this header.
 /// NOTE(review): presumably equivalent to
 /// ArrowWriterProperties::Builder().build() — confirm in the implementation.
 PARQUET_EXPORT
 std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();

 }  // namespace parquet

 #endif  // PARQUET_COLUMN_PROPERTIES_H