src/parquet/encoding.h - parquet-cpp - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #ifndef PARQUET_ENCODING_H
 #define PARQUET_ENCODING_H

 #include <cstdint>
 #include <memory>

 #include <arrow/util/bit-util.h>

 #include "parquet/exception.h"
 #include "parquet/schema.h"
 #include "parquet/types.h"
 #include "parquet/util/bit-util.h"
 #include "parquet/util/memory.h"

 namespace parquet {

 class ColumnDescriptor;

 // Base class for value encoders. Since encoders may or may not have state (e.g.,
 // dictionary encoding) we use a class instance to maintain any state.
 //
 // TODO(wesm): Encode interface API is temporary
 template <typename DType>
 class Encoder {
  public:
   typedef typename DType::c_type T;

   virtual ~Encoder() {}

   virtual int64_t EstimatedDataEncodedSize() = 0;
   virtual std::shared_ptr<Buffer> FlushValues() = 0;
   virtual void Put(const T* src, int num_values) = 0;
   virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
       int64_t valid_bits_offset) {
     PoolBuffer buffer(pool_);
     buffer.Resize(num_values * sizeof(T));
     int32_t num_valid_values = 0;
     INIT_BITSET(valid_bits, valid_bits_offset);
     T* data = reinterpret_cast<T*>(buffer.mutable_data());
     for (int32_t i = 0; i < num_values; i++) {
       if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
         data[num_valid_values++] = src[i];
       }
       READ_NEXT_BITSET(valid_bits);
     }
     Put(data, num_valid_values);
   }

   Encoding::type encoding() const { return encoding_; }

  protected:
   explicit Encoder(
       const ColumnDescriptor* descr, Encoding::type encoding, ::arrow::MemoryPool* pool)
       : descr_(descr), encoding_(encoding), pool_(pool) {}

   // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
   const ColumnDescriptor* descr_;
   const Encoding::type encoding_;
   ::arrow::MemoryPool* pool_;
 };

 // The Decoder template is parameterized on parquet::DataType subclasses
 template <typename DType>
 class Decoder {
  public:
   // Value type produced by this decoder, taken from DType::c_type.
   using T = typename DType::c_type;

   virtual ~Decoder() = default;

   // Sets the data for a new page. This will be called multiple times on the same
   // decoder and should reset all internal state.
   virtual void SetData(int num_values, const uint8_t* data, int len) = 0;

   // Subclasses should override the ones they support. In each of these functions,
   // the decoder would decode up to 'max_values', storing the result in 'buffer'.
   // The function returns the number of values decoded, which should be max_values
   // except for end of the current data page.
   //
   // This base implementation always throws ParquetException.
   virtual int Decode(T* buffer, int max_values) {
     throw ParquetException("Decoder does not implement this type.");
   }

   // Decode the values in this data page but leave spaces for null entries.
   //
   // num_values is the size of the def_levels and buffer arrays including the number of
   // null values. Exactly num_values - null_count values are requested from
   // Decode(); a ParquetException is thrown when a different number comes back.
   // Returns num_values.
   virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
       const uint8_t* valid_bits, int64_t valid_bits_offset) {
     const int expected = num_values - null_count;
     const int decoded = Decode(buffer, expected);
     if (decoded != expected) {
       throw ParquetException("Number of values / definition_levels read did not match");
     }

     // The decoded values sit densely at the front of 'buffer'. Walk the slots
     // from the last one down to the first so that each value is moved to its
     // final position before that part of the buffer is overwritten.
     int remaining = decoded;
     int slot = num_values;
     while (slot > 0) {
       --slot;
       if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + slot)) {
         --remaining;
         buffer[slot] = buffer[remaining];
       }
     }
     return num_values;
   }

   // Returns the number of values left (for the last call to SetData()). This is
   // the number of values left in this page.
   int values_left() const { return num_values_; }

   // Encoding kind this decoder was constructed with.
   Encoding::type encoding() const { return encoding_; }

  protected:
   // Records the column descriptor and encoding kind; the value counter
   // starts at zero.
   explicit Decoder(const ColumnDescriptor* descr, Encoding::type encoding)
       : descr_(descr), encoding_(encoding), num_values_(0) {}

   // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
   const ColumnDescriptor* descr_;

   const Encoding::type encoding_;
   // Value counter reported by values_left().
   int num_values_;
 };

 }  // namespace parquet

 #endif  // PARQUET_ENCODING_H
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#ifndef PARQUET_ENCODING_H
	#define PARQUET_ENCODING_H

	#include <cstdint>
	#include <memory>

	#include <arrow/util/bit-util.h>

	#include "parquet/exception.h"
	#include "parquet/schema.h"
	#include "parquet/types.h"
	#include "parquet/util/bit-util.h"
	#include "parquet/util/memory.h"

	namespace parquet {

	class ColumnDescriptor;

	// Base class for value encoders. Since encoders may or may not have state (e.g.,
	// dictionary encoding) we use a class instance to maintain any state.
	//
	// TODO(wesm): Encode interface API is temporary
	template <typename DType>
	class Encoder {
	public:
	typedef typename DType::c_type T;

	virtual ~Encoder() {}

	virtual int64_t EstimatedDataEncodedSize() = 0;
	virtual std::shared_ptr<Buffer> FlushValues() = 0;
	virtual void Put(const T* src, int num_values) = 0;
	virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
	int64_t valid_bits_offset) {
	PoolBuffer buffer(pool_);
	buffer.Resize(num_values * sizeof(T));
	int32_t num_valid_values = 0;
	INIT_BITSET(valid_bits, valid_bits_offset);
	T* data = reinterpret_cast<T*>(buffer.mutable_data());
	for (int32_t i = 0; i < num_values; i++) {
	if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
	data[num_valid_values++] = src[i];
	}
	READ_NEXT_BITSET(valid_bits);
	}
	Put(data, num_valid_values);
	}

	Encoding::type encoding() const { return encoding_; }

	protected:
	explicit Encoder(
	const ColumnDescriptor* descr, Encoding::type encoding, ::arrow::MemoryPool* pool)
	: descr_(descr), encoding_(encoding), pool_(pool) {}

	// For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
	const ColumnDescriptor* descr_;
	const Encoding::type encoding_;
	::arrow::MemoryPool* pool_;
	};

	// The Decoder template is parameterized on parquet::DataType subclasses
	template <typename DType>
	class Decoder {
	 public:
	  // Value type produced by this decoder, taken from DType::c_type.
	  using T = typename DType::c_type;

	  virtual ~Decoder() = default;

	  // Sets the data for a new page. This will be called multiple times on the same
	  // decoder and should reset all internal state.
	  virtual void SetData(int num_values, const uint8_t* data, int len) = 0;

	  // Subclasses should override the ones they support. In each of these functions,
	  // the decoder would decode up to 'max_values', storing the result in 'buffer'.
	  // The function returns the number of values decoded, which should be max_values
	  // except for end of the current data page.
	  //
	  // This base implementation always throws ParquetException.
	  virtual int Decode(T* buffer, int max_values) {
	    throw ParquetException("Decoder does not implement this type.");
	  }

	  // Decode the values in this data page but leave spaces for null entries.
	  //
	  // num_values is the size of the def_levels and buffer arrays including the number of
	  // null values. Exactly num_values - null_count values are requested from
	  // Decode(); a ParquetException is thrown when a different number comes back.
	  // Returns num_values.
	  virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
	      const uint8_t* valid_bits, int64_t valid_bits_offset) {
	    const int expected = num_values - null_count;
	    const int decoded = Decode(buffer, expected);
	    if (decoded != expected) {
	      throw ParquetException("Number of values / definition_levels read did not match");
	    }

	    // The decoded values sit densely at the front of 'buffer'. Walk the slots
	    // from the last one down to the first so that each value is moved to its
	    // final position before that part of the buffer is overwritten.
	    int remaining = decoded;
	    int slot = num_values;
	    while (slot > 0) {
	      --slot;
	      if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + slot)) {
	        --remaining;
	        buffer[slot] = buffer[remaining];
	      }
	    }
	    return num_values;
	  }

	  // Returns the number of values left (for the last call to SetData()). This is
	  // the number of values left in this page.
	  int values_left() const { return num_values_; }

	  // Encoding kind this decoder was constructed with.
	  Encoding::type encoding() const { return encoding_; }

	 protected:
	  // Records the column descriptor and encoding kind; the value counter
	  // starts at zero.
	  explicit Decoder(const ColumnDescriptor* descr, Encoding::type encoding)
	      : descr_(descr), encoding_(encoding), num_values_(0) {}

	  // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
	  const ColumnDescriptor* descr_;

	  const Encoding::type encoding_;
	  // Value counter reported by values_left().
	  int num_values_;
	};

	} // namespace parquet

	#endif // PARQUET_ENCODING_H