// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This contains all the template implementations for functions defined in bit-packing.h.
// This should be included by files that want to instantiate those templates directly.
// Including this file is not generally necessary - instead the templates should be
// instantiated in bit-packing.cc so that compile times stay manageable.

#pragma once

#include "util/bit-packing.h"

#include <algorithm>
#include <climits>
#include <cstring>
#include <type_traits>
#include <utility>

#include <boost/preprocessor/repetition/repeat_from_to.hpp>

#include "common/compiler-util.h"
#include "common/logging.h"
#include "util/bit-util.h"

namespace impala {

inline int64_t BitPacking::NumValuesToUnpack(
int bit_width, int64_t in_bytes, int64_t num_values) {
// Check if we have enough input bytes to decode 'num_values'.
if (bit_width == 0 || BitUtil::RoundUpNumBytes(num_values * bit_width) <= in_bytes) {
// Limited by output space.
return num_values;
} else {
// Limited by the number of input bytes. Compute the number of values that can be
// unpacked from the input.
return (in_bytes * CHAR_BIT) / bit_width;
}
}
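
// For illustration, a worked example of the computation above (hypothetical numbers):
// with bit_width = 3, in_bytes = 4 and num_values = 16, RoundUpNumBytes(16 * 3) = 6 > 4,
// so the result is limited by the input and (4 * CHAR_BIT) / 3 = 10 values can be
// unpacked.
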
template <typename T>
constexpr bool IsSupportedUnpackingType() {
return std::is_same<T, uint8_t>::value
|| std::is_same<T, uint16_t>::value
|| std::is_same<T, uint32_t>::value
|| std::is_same<T, uint64_t>::value;
}
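
// For example, IsSupportedUnpackingType<uint32_t>() is true while
// IsSupportedUnpackingType<int32_t>() is false, so signed output types are rejected at
// compile time by the static_asserts below.
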
template <typename OutType>
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(int bit_width,
const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
OutType* __restrict__ out) {
static_assert(IsSupportedUnpackingType<OutType>(),
"Only unsigned integers are supported.");
#pragma push_macro("UNPACK_VALUES_CASE")
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
case i: \
return UnpackValues<OutType, i>(in, in_bytes, num_values, out);
switch (bit_width) {
// Expand cases from 0 to 64.
BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
default:
DCHECK(false);
return std::make_pair(nullptr, -1);
}
#pragma pop_macro("UNPACK_VALUES_CASE")
}
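
// A minimal usage sketch of the dispatch above (the buffer contents are hypothetical):
//
//   const uint8_t packed[3] = {0x88, 0xC6, 0xFA};  // Eight 3-bit values.
//   uint32_t out[8];
//   std::pair<const uint8_t*, int64_t> result =
//       BitPacking::UnpackValues<uint32_t>(3, packed, sizeof(packed), 8, out);
//   // result.first points just past the consumed input; result.second == 8.
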
template <typename OutType, int BIT_WIDTH>
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(
const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
OutType* __restrict__ out) {
static_assert(IsSupportedUnpackingType<OutType>(),
"Only unsigned integers are supported.");
constexpr int BATCH_SIZE = 32;
const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
const int64_t batches_to_read = values_to_read / BATCH_SIZE;
const int64_t remainder_values = values_to_read % BATCH_SIZE;
const uint8_t* in_pos = in;
OutType* out_pos = out;
// First unpack as many full batches as possible.
for (int64_t i = 0; i < batches_to_read; ++i) {
in_pos = Unpack32Values<OutType, BIT_WIDTH>(in_pos, in_bytes, out_pos);
out_pos += BATCH_SIZE;
in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
}
// Then unpack the final partial batch.
if (remainder_values > 0) {
in_pos = UnpackUpTo31Values<OutType, BIT_WIDTH>(
in_pos, in_bytes, remainder_values, out_pos);
}
return std::make_pair(in_pos, values_to_read);
}
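
// For example (hypothetical numbers): values_to_read = 100 gives batches_to_read = 3 and
// remainder_values = 4, i.e. three calls to Unpack32Values followed by one call to
// UnpackUpTo31Values.
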
template <typename OutType>
std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(int bit_width,
const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ dict,
int64_t dict_len, int64_t num_values, OutType* __restrict__ out, int64_t stride,
bool* __restrict__ decode_error) {
#pragma push_macro("UNPACK_VALUES_CASE")
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
case i: \
return UnpackAndDecodeValues<OutType, i>( \
in, in_bytes, dict, dict_len, num_values, out, stride, decode_error);
switch (bit_width) {
// Expand cases from 0 to MAX_DICT_BITWIDTH.
BOOST_PP_REPEAT_FROM_TO(0, 33, UNPACK_VALUES_CASE, ignore);
default:
DCHECK(false);
return std::make_pair(nullptr, -1);
}
#pragma pop_macro("UNPACK_VALUES_CASE")
}

template <typename OutType, int BIT_WIDTH>
std::pair<const uint8_t*, int64_t> BitPacking::UnpackAndDecodeValues(
const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ dict,
int64_t dict_len, int64_t num_values, OutType* __restrict__ out, int64_t stride,
bool* __restrict__ decode_error) {
constexpr int BATCH_SIZE = 32;
const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
const int64_t batches_to_read = values_to_read / BATCH_SIZE;
const int64_t remainder_values = values_to_read % BATCH_SIZE;
const uint8_t* in_pos = in;
uint8_t* out_pos = reinterpret_cast<uint8_t*>(out);
// First unpack as many full batches as possible.
for (int64_t i = 0; i < batches_to_read; ++i) {
in_pos = UnpackAndDecode32Values<OutType, BIT_WIDTH>(
in_pos, in_bytes, dict, dict_len, reinterpret_cast<OutType*>(out_pos), stride,
decode_error);
out_pos += stride * BATCH_SIZE;
in_bytes -= (BATCH_SIZE * BIT_WIDTH) / CHAR_BIT;
}
// Then unpack the final partial batch.
if (remainder_values > 0) {
in_pos = UnpackAndDecodeUpTo31Values<OutType, BIT_WIDTH>(
in_pos, in_bytes, dict, dict_len, remainder_values,
reinterpret_cast<OutType*>(out_pos), stride, decode_error);
}
return std::make_pair(in_pos, values_to_read);
}
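
// A minimal usage sketch (dictionary and buffer contents are hypothetical):
//
//   uint32_t dict[3] = {7, 42, 1000};
//   const uint8_t packed[2] = {0x24, 0x24};  // Eight 2-bit indexes, all < 3.
//   uint32_t out[8];
//   bool decode_error = false;
//   BitPacking::UnpackAndDecodeValues<uint32_t>(2, packed, sizeof(packed), dict, 3, 8,
//       out, sizeof(uint32_t), &decode_error);
//   // 'decode_error' becomes true only if an unpacked index was >= dict_len; it is
//   // never reset to false.
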
// Loop body of unrolled loop that unpacks the value. BIT_WIDTH is the bit width of
// the packed values. 'in_buf' is the start of the input buffer and 'out_vals' is the
// start of the output values array. This function unpacks the VALUE_IDX'th packed value
// from 'in_buf'.
//
// This implements essentially the same algorithm as the (Apache-licensed) code in
// bpacking.c at https://github.com/lemire/FrameOfReference/, but is much more compact
// because it uses templates rather than source-level unrolling of all combinations.
//
// After the template parameters are expanded and constants are propagated, all branches
// and offset/shift calculations should be optimized out, leaving only shifts by constants
// and bitmasks by constants. Calls to this must be stamped out manually or with
// BOOST_PP_REPEAT_FROM_TO: experimentation revealed that the GCC 4.9.2 optimiser was
// not able to fully propagate constants and remove branches when this was called from
// inside a for loop with constant bounds with VALUE_IDX changed to a function argument.
//
// We compute how many 32-bit words we have to read, which is either 1, 2 or 3. If it is
// at least 2, the first two 32-bit words are read as one 64-bit word. Even if only one
// word needs to be read, we try to read 64 bits if it does not lead to buffer overflow,
// because benchmarks show that it has a positive effect on performance.
//
// If 'FULL_BATCH' is true, this function call is part of unpacking 32 values, otherwise
// up to 31 values. This is needed to optimise the length of the reads (32 or 64 bits)
// and to avoid buffer overflow (if we are unpacking 32 values, we can safely assume an
// input buffer of length 32 * BIT_WIDTH bits).
template <int BIT_WIDTH, int VALUE_IDX, bool FULL_BATCH>
inline uint64_t ALWAYS_INLINE UnpackValue(const uint8_t* __restrict__ in_buf) {
if (BIT_WIDTH == 0) return 0;
constexpr int FIRST_BIT_IDX = VALUE_IDX * BIT_WIDTH;
constexpr int FIRST_WORD_IDX = FIRST_BIT_IDX / 32;
constexpr int LAST_BIT_IDX = FIRST_BIT_IDX + BIT_WIDTH;
constexpr int LAST_WORD_IDX = BitUtil::RoundUpNumi32(LAST_BIT_IDX);
constexpr int WORDS_TO_READ = LAST_WORD_IDX - FIRST_WORD_IDX;
static_assert(WORDS_TO_READ <= 3, "At most three 32-bit words need to be loaded.");
constexpr int FIRST_BIT_OFFSET = FIRST_BIT_IDX - FIRST_WORD_IDX * 32;
constexpr uint64_t mask = BIT_WIDTH == 64 ? ~0L : (1UL << BIT_WIDTH) - 1;
const uint32_t* const in = reinterpret_cast<const uint32_t*>(in_buf);
  // Avoid reading past the end of the buffer. We can safely read 64 bits if we know that
  // this is a full batch read (so the input buffer is 32 * BIT_WIDTH bits long) and
  // there is enough space in the buffer from the current reading point.
// We try to read 64 bits even when it is not necessary because the benchmarks show it
// is faster.
constexpr bool CAN_SAFELY_READ_64_BITS = FULL_BATCH
&& FIRST_BIT_IDX - FIRST_BIT_OFFSET + 64 <= BIT_WIDTH * 32;
// We do not try to read 64 bits when the bit width is a power of two (unless it is
// necessary) because performance benchmarks show that it is better this way. This seems
// to be due to compiler optimisation issues, so we can revisit it when we update the
// compiler version.
constexpr bool READ_32_BITS = WORDS_TO_READ == 1
&& (!CAN_SAFELY_READ_64_BITS || BitUtil::IsPowerOf2(BIT_WIDTH));
if (READ_32_BITS) {
uint32_t word = in[FIRST_WORD_IDX];
word >>= FIRST_BIT_OFFSET < 32 ? FIRST_BIT_OFFSET : 0;
return word & mask;
}
uint64_t word = *reinterpret_cast<const uint64_t*>(in + FIRST_WORD_IDX);
word >>= FIRST_BIT_OFFSET;
if (WORDS_TO_READ > 2) {
constexpr int USEFUL_BITS = FIRST_BIT_OFFSET == 0 ? 0 : 64 - FIRST_BIT_OFFSET;
uint64_t extra_word = in[FIRST_WORD_IDX + 2];
word |= extra_word << USEFUL_BITS;
}
return word & mask;
}
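
// A worked example of the constant folding above (hypothetical parameters): with
// BIT_WIDTH = 5 and VALUE_IDX = 7, FIRST_BIT_IDX = 35, FIRST_WORD_IDX = 1,
// LAST_BIT_IDX = 40, LAST_WORD_IDX = 2, WORDS_TO_READ = 1 and FIRST_BIT_OFFSET = 3.
// Because 5 is not a power of two, a full batch takes the 64-bit path and the value
// reduces to a load, a shift by 3 and a mask with 0x1F.
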
template <typename OutType>
inline void ALWAYS_INLINE DecodeValue(OutType* __restrict__ dict, int64_t dict_len,
uint32_t idx, OutType* __restrict__ out_val, bool* __restrict__ decode_error) {
if (UNLIKELY(idx >= dict_len)) {
*decode_error = true;
} else {
// Use memcpy() because we can't assume sufficient alignment in some cases (e.g.
// 16 byte decimals).
memcpy(out_val, &dict[idx], sizeof(OutType));
}
}

template <typename OutType, int BIT_WIDTH>
const uint8_t* BitPacking::Unpack32Values(
const uint8_t* __restrict__ in, int64_t in_bytes, OutType* __restrict__ out) {
static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
DCHECK_GE(in_bytes, BYTES_TO_READ);
// Call UnpackValue for 0 <= i < 32.
#pragma push_macro("UNPACK_VALUE_CALL")
#define UNPACK_VALUE_CALL(ignore1, i, ignore2) \
out[i] = static_cast<OutType>(UnpackValue<BIT_WIDTH, i, true>(in));
BOOST_PP_REPEAT_FROM_TO(0, 32, UNPACK_VALUE_CALL, ignore);
return in + BYTES_TO_READ;
#pragma pop_macro("UNPACK_VALUE_CALL")
}
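
// For example (hypothetical width): with BIT_WIDTH = 5, BYTES_TO_READ =
// RoundUpNumBytes(32 * 5) = 20, so each full batch consumes exactly 20 input bytes and
// produces 32 output values.
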
template <typename OutType>
const uint8_t* BitPacking::Unpack32Values(int bit_width, const uint8_t* __restrict__ in,
int64_t in_bytes, OutType* __restrict__ out) {
#pragma push_macro("UNPACK_VALUES_CASE")
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
case i: return Unpack32Values<OutType, i>(in, in_bytes, out);
switch (bit_width) {
// Expand cases from 0 to 64.
BOOST_PP_REPEAT_FROM_TO(0, 65, UNPACK_VALUES_CASE, ignore);
default: DCHECK(false); return in;
}
#pragma pop_macro("UNPACK_VALUES_CASE")
}

template <typename OutType, int BIT_WIDTH>
const uint8_t* BitPacking::UnpackAndDecode32Values(const uint8_t* __restrict__ in,
int64_t in_bytes, OutType* __restrict__ dict, int64_t dict_len,
OutType* __restrict__ out, int64_t stride, bool* __restrict__ decode_error) {
static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
constexpr int BYTES_TO_READ = BitUtil::RoundUpNumBytes(32 * BIT_WIDTH);
DCHECK_GE(in_bytes, BYTES_TO_READ);
// TODO: this could be optimised further by using SIMD instructions.
// https://lemire.me/blog/2016/08/25/faster-dictionary-decoding-with-simd-instructions/
  static_assert(BIT_WIDTH <= MAX_DICT_BITWIDTH,
      "Bit width too high for dictionary index.");
// Call UnpackValue() and DecodeValue() for 0 <= i < 32.
#pragma push_macro("DECODE_VALUE_CALL")
#define DECODE_VALUE_CALL(ignore1, i, ignore2) \
{ \
uint32_t idx = UnpackValue<BIT_WIDTH, i, true>(in); \
uint8_t* out_pos = reinterpret_cast<uint8_t*>(out) + i * stride; \
DecodeValue(dict, dict_len, idx, reinterpret_cast<OutType*>(out_pos), decode_error); \
}
BOOST_PP_REPEAT_FROM_TO(0, 32, DECODE_VALUE_CALL, ignore);
return in + BYTES_TO_READ;
#pragma pop_macro("DECODE_VALUE_CALL")
}

template <typename OutType, int BIT_WIDTH>
const uint8_t* BitPacking::UnpackUpTo31Values(const uint8_t* __restrict__ in,
int64_t in_bytes, int num_values, OutType* __restrict__ out) {
static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
DCHECK_LE(BIT_WIDTH, sizeof(OutType) * CHAR_BIT) << "BIT_WIDTH too high for output";
constexpr int MAX_BATCH_SIZE = 31;
const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
DCHECK_GE(in_bytes, BYTES_TO_READ);
DCHECK_LE(num_values, MAX_BATCH_SIZE);
// Make sure the buffer is at least 1 byte.
constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ?
(BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
uint8_t tmp_buffer[TMP_BUFFER_SIZE];
const uint8_t* in_buffer = in;
// Copy into padded temporary buffer to avoid reading past the end of 'in' if the
// last 32-bit load would go past the end of the buffer.
if (BitUtil::RoundUp(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
memcpy(tmp_buffer, in, BYTES_TO_READ);
in_buffer = tmp_buffer;
}
#pragma push_macro("UNPACK_VALUES_CASE")
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
case 31 - i: out[30 - i] = \
static_cast<OutType>(UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer));
// Use switch with fall-through cases to minimise branching.
switch (num_values) {
// Expand cases from 31 down to 1.
BOOST_PP_REPEAT_FROM_TO(0, 31, UNPACK_VALUES_CASE, ignore);
case 0: break;
default: DCHECK(false);
}
return in + BYTES_TO_READ;
#pragma pop_macro("UNPACK_VALUES_CASE")
}
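
// A worked example of the padding logic above (hypothetical sizes): with BIT_WIDTH = 3
// and num_values = 5, BYTES_TO_READ = RoundUpNumBytes(15) = 2. If in_bytes == 2, then
// RoundUp(2, 4) = 4 > 2, so the two input bytes are first copied into 'tmp_buffer' and
// the 32/64-bit loads hit the padding instead of memory past the end of 'in'.
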
template <typename OutType, int BIT_WIDTH>
const uint8_t* BitPacking::UnpackAndDecodeUpTo31Values(const uint8_t* __restrict__ in,
int64_t in_bytes, OutType* __restrict__ dict, int64_t dict_len, int num_values,
OutType* __restrict__ out, int64_t stride, bool* __restrict__ decode_error) {
static_assert(BIT_WIDTH >= 0, "BIT_WIDTH too low");
static_assert(BIT_WIDTH <= MAX_BITWIDTH, "BIT_WIDTH too high");
constexpr int MAX_BATCH_SIZE = 31;
const int BYTES_TO_READ = BitUtil::RoundUpNumBytes(num_values * BIT_WIDTH);
DCHECK_GE(in_bytes, BYTES_TO_READ);
DCHECK_LE(num_values, MAX_BATCH_SIZE);
// Make sure the buffer is at least 1 byte.
constexpr int TMP_BUFFER_SIZE = BIT_WIDTH ?
(BIT_WIDTH * (MAX_BATCH_SIZE + 1)) / CHAR_BIT : 1;
uint8_t tmp_buffer[TMP_BUFFER_SIZE];
const uint8_t* in_buffer = in;
// Copy into padded temporary buffer to avoid reading past the end of 'in' if the
// last 32-bit load would go past the end of the buffer.
if (BitUtil::RoundUp(BYTES_TO_READ, sizeof(uint32_t)) > in_bytes) {
memcpy(tmp_buffer, in, BYTES_TO_READ);
in_buffer = tmp_buffer;
}
#pragma push_macro("DECODE_VALUES_CASE")
#define DECODE_VALUES_CASE(ignore1, i, ignore2) \
case 31 - i: { \
uint32_t idx = UnpackValue<BIT_WIDTH, 30 - i, false>(in_buffer); \
uint8_t* out_pos = reinterpret_cast<uint8_t*>(out) + (30 - i) * stride; \
DecodeValue(dict, dict_len, idx, reinterpret_cast<OutType*>(out_pos), decode_error); \
}
// Use switch with fall-through cases to minimise branching.
switch (num_values) {
// Expand cases from 31 down to 1.
BOOST_PP_REPEAT_FROM_TO(0, 31, DECODE_VALUES_CASE, ignore);
case 0:
break;
default:
DCHECK(false);
}
return in + BYTES_TO_READ;
#pragma pop_macro("DECODE_VALUES_CASE")
}

} // namespace impala