cpp/src/arrow/util/rle_encoding_test.cc - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 // From Apache Impala (incubating) as of 2016-01-29

 #include <cstdint>
 #include <cstring>
 #include <random>
 #include <vector>

 #include <gtest/gtest.h>

 #include "arrow/array.h"
 #include "arrow/buffer.h"
 #include "arrow/testing/random.h"
 #include "arrow/type.h"
 #include "arrow/util/bit_stream_utils.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/io_util.h"
 #include "arrow/util/rle_encoding.h"

 namespace arrow {
 namespace util {

 const int MAX_WIDTH = 32;

 TEST(BitArray, TestBool) {
   const int len = 8;
   uint8_t buffer[len];

   BitUtil::BitWriter writer(buffer, len);

   // Write alternating 0's and 1's
   for (int i = 0; i < 8; ++i) {
     EXPECT_TRUE(writer.PutValue(i % 2, 1));
   }
   writer.Flush();

   EXPECT_EQ(buffer[0], 0xAA /* 0b10101010 */);

   // Write 00110011
   for (int i = 0; i < 8; ++i) {
     bool result = false;
     switch (i) {
       case 0:
       case 1:
       case 4:
       case 5:
         result = writer.PutValue(false, 1);
         break;
       default:
         result = writer.PutValue(true, 1);
         break;
     }
     EXPECT_TRUE(result);
   }
   writer.Flush();

   // Validate the exact bit value
   EXPECT_EQ(buffer[0], 0xAA /* 0b10101010 */);
   EXPECT_EQ(buffer[1], 0xCC /* 0b11001100 */);

   // Use the reader and validate
   BitUtil::BitReader reader(buffer, len);
   for (int i = 0; i < 8; ++i) {
     bool val = false;
     bool result = reader.GetValue(1, &val);
     EXPECT_TRUE(result);
     EXPECT_EQ(val, (i % 2) != 0);
   }

   for (int i = 0; i < 8; ++i) {
     bool val = false;
     bool result = reader.GetValue(1, &val);
     EXPECT_TRUE(result);
     switch (i) {
       case 0:
       case 1:
       case 4:
       case 5:
         EXPECT_EQ(val, false);
         break;
       default:
         EXPECT_EQ(val, true);
         break;
     }
   }
 }

 // Writes 'num_vals' values with width 'bit_width' and reads them back.
 void TestBitArrayValues(int bit_width, int num_vals) {
   int len = static_cast<int>(BitUtil::BytesForBits(bit_width * num_vals));
   EXPECT_GT(len, 0);
   const uint64_t mod = bit_width == 64 ? 1 : 1LL << bit_width;

   std::vector<uint8_t> buffer(len);
   BitUtil::BitWriter writer(buffer.data(), len);
   for (int i = 0; i < num_vals; ++i) {
     bool result = writer.PutValue(i % mod, bit_width);
     EXPECT_TRUE(result);
   }
   writer.Flush();
   EXPECT_EQ(writer.bytes_written(), len);

   BitUtil::BitReader reader(buffer.data(), len);
   for (int i = 0; i < num_vals; ++i) {
     int64_t val = 0;
     bool result = reader.GetValue(bit_width, &val);
     EXPECT_TRUE(result);
     EXPECT_EQ(val, i % mod);
   }
   EXPECT_EQ(reader.bytes_left(), 0);
 }

 TEST(BitArray, TestValues) {
   for (int width = 1; width <= MAX_WIDTH; ++width) {
     TestBitArrayValues(width, 1);
     TestBitArrayValues(width, 2);
     // Don't write too many values
     TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096);
     TestBitArrayValues(width, 1024);
   }
 }

 // Test some mixed values
 TEST(BitArray, TestMixed) {
   const int len = 1024;
   uint8_t buffer[len];
   bool parity = true;

   BitUtil::BitWriter writer(buffer, len);
   for (int i = 0; i < len; ++i) {
     bool result;
     if (i % 2 == 0) {
       result = writer.PutValue(parity, 1);
       parity = !parity;
     } else {
       result = writer.PutValue(i, 10);
     }
     EXPECT_TRUE(result);
   }
   writer.Flush();

   parity = true;
   BitUtil::BitReader reader(buffer, len);
   for (int i = 0; i < len; ++i) {
     bool result;
     if (i % 2 == 0) {
       bool val;
       result = reader.GetValue(1, &val);
       EXPECT_EQ(val, parity);
       parity = !parity;
     } else {
       int val;
       result = reader.GetValue(10, &val);
       EXPECT_EQ(val, i);
     }
     EXPECT_TRUE(result);
   }
 }

 // Validates encoding of values by encoding and decoding them.  If
 // expected_encoding != NULL, also validates that the encoded buffer is
 // exactly 'expected_encoding'.
 // if expected_len is not -1, it will validate the encoded size is correct.
 void ValidateRle(const std::vector<int>& values, int bit_width,
                  uint8_t* expected_encoding, int expected_len) {
   const int len = 64 * 1024;
   uint8_t buffer[len];
   EXPECT_LE(expected_len, len);

   RleEncoder encoder(buffer, len, bit_width);
   for (size_t i = 0; i < values.size(); ++i) {
     bool result = encoder.Put(values[i]);
     EXPECT_TRUE(result);
   }
   int encoded_len = encoder.Flush();

   if (expected_len != -1) {
     EXPECT_EQ(encoded_len, expected_len);
   }
   if (expected_encoding != NULL) {
     EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0);
   }

   // Verify read
   {
     RleDecoder decoder(buffer, len, bit_width);
     for (size_t i = 0; i < values.size(); ++i) {
       uint64_t val;
       bool result = decoder.Get(&val);
       EXPECT_TRUE(result);
       EXPECT_EQ(values[i], val);
     }
   }

   // Verify batch read
   {
     RleDecoder decoder(buffer, len, bit_width);
     std::vector<int> values_read(values.size());
     ASSERT_EQ(values.size(),
               decoder.GetBatch(values_read.data(), static_cast<int>(values.size())));
     EXPECT_EQ(values, values_read);
   }
 }

 // A version of ValidateRle that round-trips the values and returns false if
 // the returned values are not all the same
 bool CheckRoundTrip(const std::vector<int>& values, int bit_width) {
   const int len = 64 * 1024;
   uint8_t buffer[len];
   RleEncoder encoder(buffer, len, bit_width);
   for (size_t i = 0; i < values.size(); ++i) {
     bool result = encoder.Put(values[i]);
     if (!result) {
       return false;
     }
   }
   int encoded_len = encoder.Flush();
   int out = 0;

   {
     RleDecoder decoder(buffer, encoded_len, bit_width);
     for (size_t i = 0; i < values.size(); ++i) {
       EXPECT_TRUE(decoder.Get(&out));
       if (values[i] != out) {
         return false;
       }
     }
   }

   // Verify batch read
   {
     RleDecoder decoder(buffer, encoded_len, bit_width);
     std::vector<int> values_read(values.size());
     if (static_cast<int>(values.size()) !=
         decoder.GetBatch(values_read.data(), static_cast<int>(values.size()))) {
       return false;
     }

     if (values != values_read) {
       return false;
     }
   }

   return true;
 }

 TEST(Rle, SpecificSequences) {
   const int len = 1024;
   uint8_t expected_buffer[len];
   std::vector<int> values;

   // Test 50 0' followed by 50 1's
   values.resize(100);
   for (int i = 0; i < 50; ++i) {
     values[i] = 0;
   }
   for (int i = 50; i < 100; ++i) {
     values[i] = 1;
   }

   // expected_buffer valid for bit width <= 1 byte
   expected_buffer[0] = (50 << 1);
   expected_buffer[1] = 0;
   expected_buffer[2] = (50 << 1);
   expected_buffer[3] = 1;
   for (int width = 1; width <= 8; ++width) {
     ValidateRle(values, width, expected_buffer, 4);
   }

   for (int width = 9; width <= MAX_WIDTH; ++width) {
     ValidateRle(values, width, nullptr,
                 2 * (1 + static_cast<int>(BitUtil::CeilDiv(width, 8))));
   }

   // Test 100 0's and 1's alternating
   for (int i = 0; i < 100; ++i) {
     values[i] = i % 2;
   }
   int num_groups = static_cast<int>(BitUtil::CeilDiv(100, 8));
   expected_buffer[0] = static_cast<uint8_t>((num_groups << 1) | 1);
   for (int i = 1; i <= 100 / 8; ++i) {
     expected_buffer[i] = 0xAA /* 0b10101010 */;
   }
   // Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0.
   expected_buffer[100 / 8 + 1] = 0x0A /* 0b00001010 */;

   // num_groups and expected_buffer only valid for bit width = 1
   ValidateRle(values, 1, expected_buffer, 1 + num_groups);
   for (int width = 2; width <= MAX_WIDTH; ++width) {
     int num_values = static_cast<int>(BitUtil::CeilDiv(100, 8)) * 8;
     ValidateRle(values, width, nullptr,
                 1 + static_cast<int>(BitUtil::CeilDiv(width * num_values, 8)));
   }

   // Test 16-bit values to confirm encoded values are stored in little endian
   values.resize(28);
   for (int i = 0; i < 16; ++i) {
     values[i] = 0x55aa;
   }
   for (int i = 16; i < 28; ++i) {
     values[i] = 0xaa55;
   }
   expected_buffer[0] = (16 << 1);
   expected_buffer[1] = 0xaa;
   expected_buffer[2] = 0x55;
   expected_buffer[3] = (12 << 1);
   expected_buffer[4] = 0x55;
   expected_buffer[5] = 0xaa;

   ValidateRle(values, 16, expected_buffer, 6);

   // Test 32-bit values to confirm encoded values are stored in little endian
   values.resize(28);
   for (int i = 0; i < 16; ++i) {
     values[i] = 0x555aaaa5;
   }
   for (int i = 16; i < 28; ++i) {
     values[i] = 0x5aaaa555;
   }
   expected_buffer[0] = (16 << 1);
   expected_buffer[1] = 0xa5;
   expected_buffer[2] = 0xaa;
   expected_buffer[3] = 0x5a;
   expected_buffer[4] = 0x55;
   expected_buffer[5] = (12 << 1);
   expected_buffer[6] = 0x55;
   expected_buffer[7] = 0xa5;
   expected_buffer[8] = 0xaa;
   expected_buffer[9] = 0x5a;

   ValidateRle(values, 32, expected_buffer, 10);
 }

 // ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value
 // is used, otherwise alternating values are used.
 void TestRleValues(int bit_width, int num_vals, int value = -1) {
   const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width;
   std::vector<int> values;
   for (int v = 0; v < num_vals; ++v) {
     values.push_back((value != -1) ? value : static_cast<int>(v % mod));
   }
   ValidateRle(values, bit_width, NULL, -1);
 }

 TEST(Rle, TestValues) {
   for (int width = 1; width <= MAX_WIDTH; ++width) {
     TestRleValues(width, 1);
     TestRleValues(width, 1024);
     TestRleValues(width, 1024, 0);
     TestRleValues(width, 1024, 1);
   }
 }

 TEST(Rle, BitWidthZeroRepeated) {
   uint8_t buffer[1];
   const int num_values = 15;
   buffer[0] = num_values << 1;  // repeated indicator byte
   RleDecoder decoder(buffer, sizeof(buffer), 0);
   uint8_t val;
   for (int i = 0; i < num_values; ++i) {
     bool result = decoder.Get(&val);
     EXPECT_TRUE(result);
     EXPECT_EQ(val, 0);  // can only encode 0s with bit width 0
   }
   EXPECT_FALSE(decoder.Get(&val));
 }

 TEST(Rle, BitWidthZeroLiteral) {
   uint8_t buffer[1];
   const int num_groups = 4;
   buffer[0] = num_groups << 1 | 1;  // literal indicator byte
   RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0);
   const int num_values = num_groups * 8;
   uint8_t val;
   for (int i = 0; i < num_values; ++i) {
     bool result = decoder.Get(&val);
     EXPECT_TRUE(result);
     EXPECT_EQ(val, 0);  // can only encode 0s with bit width 0
   }
   EXPECT_FALSE(decoder.Get(&val));
 }

 // Test that writes out a repeated group and then a literal
 // group but flush before finishing.
 TEST(BitRle, Flush) {
   std::vector<int> values;
   for (int i = 0; i < 16; ++i) values.push_back(1);
   values.push_back(0);
   ValidateRle(values, 1, NULL, -1);
   values.push_back(1);
   ValidateRle(values, 1, NULL, -1);
   values.push_back(1);
   ValidateRle(values, 1, NULL, -1);
   values.push_back(1);
   ValidateRle(values, 1, NULL, -1);
 }

 // Test some random sequences.
 TEST(BitRle, Random) {
   int niters = 50;
   int ngroups = 1000;
   int max_group_size = 16;
   std::vector<int> values(ngroups + max_group_size);

   // prng setup
   const auto seed = ::arrow::internal::GetRandomSeed();
   std::default_random_engine gen(
       static_cast<std::default_random_engine::result_type>(seed));
   std::uniform_int_distribution<int> dist(1, 20);

   for (int iter = 0; iter < niters; ++iter) {
     // generate a seed with device entropy
     bool parity = 0;
     values.resize(0);

     for (int i = 0; i < ngroups; ++i) {
       int group_size = dist(gen);
       if (group_size > max_group_size) {
         group_size = 1;
       }
       for (int i = 0; i < group_size; ++i) {
         values.push_back(parity);
       }
       parity = !parity;
     }
     if (!CheckRoundTrip(values, BitUtil::NumRequiredBits(values.size()))) {
       FAIL() << "failing seed: " << seed;
     }
   }
 }

 // Test a sequence of 1 0's, 2 1's, 3 0's. etc
 // e.g. 011000111100000
 TEST(BitRle, RepeatedPattern) {
   std::vector<int> values;
   const int min_run = 1;
   const int max_run = 32;

   for (int i = min_run; i <= max_run; ++i) {
     int v = i % 2;
     for (int j = 0; j < i; ++j) {
       values.push_back(v);
     }
   }

   // And go back down again
   for (int i = max_run; i >= min_run; --i) {
     int v = i % 2;
     for (int j = 0; j < i; ++j) {
       values.push_back(v);
     }
   }

   ValidateRle(values, 1, NULL, -1);
 }

 TEST(BitRle, Overflow) {
   for (int bit_width = 1; bit_width < 32; bit_width += 3) {
     int len = RleEncoder::MinBufferSize(bit_width);
     std::vector<uint8_t> buffer(len);
     int num_added = 0;
     bool parity = true;

     RleEncoder encoder(buffer.data(), len, bit_width);
     // Insert alternating true/false until there is no space left
     while (true) {
       bool result = encoder.Put(parity);
       parity = !parity;
       if (!result) break;
       ++num_added;
     }

     int bytes_written = encoder.Flush();
     EXPECT_LE(bytes_written, len);
     EXPECT_GT(num_added, 0);

     RleDecoder decoder(buffer.data(), bytes_written, bit_width);
     parity = true;
     uint32_t v;
     for (int i = 0; i < num_added; ++i) {
       bool result = decoder.Get(&v);
       EXPECT_TRUE(result);
       EXPECT_EQ(v != 0, parity);
       parity = !parity;
     }
     // Make sure we get false when reading past end a couple times.
     EXPECT_FALSE(decoder.Get(&v));
     EXPECT_FALSE(decoder.Get(&v));
   }
 }

 template <typename Type>
 void CheckRoundTripSpaced(const Array& data, int bit_width) {
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using T = typename Type::c_type;

   int num_values = static_cast<int>(data.length());
   int buffer_size = RleEncoder::MaxBufferSize(bit_width, num_values);

   const T* values = static_cast<const ArrayType&>(data).raw_values();

   std::vector<uint8_t> buffer(buffer_size);
   RleEncoder encoder(buffer.data(), buffer_size, bit_width);
   for (int i = 0; i < num_values; ++i) {
     if (data.IsValid(i)) {
       if (!encoder.Put(static_cast<uint64_t>(values[i]))) {
         FAIL() << "Encoding failed";
       }
     }
   }
   int encoded_size = encoder.Flush();

   // Verify batch read
   RleDecoder decoder(buffer.data(), encoded_size, bit_width);
   std::vector<T> values_read(num_values);

   if (num_values != decoder.GetBatchSpaced(
                         num_values, static_cast<int>(data.null_count()),
                         data.null_bitmap_data(), data.offset(), values_read.data())) {
     FAIL();
   }

   for (int64_t i = 0; i < num_values; ++i) {
     if (data.IsValid(i)) {
       if (values_read[i] != values[i]) {
         FAIL() << "Index " << i << " read " << values_read[i] << " but should be "
                << values[i];
       }
     }
   }
 }

 template <typename T>
 struct GetBatchSpacedTestCase {
   T max_value;
   int64_t size;
   double null_probability;
   int bit_width;
 };

 TEST(RleDecoder, GetBatchSpaced) {
   uint32_t kSeed = 1337;
   ::arrow::random::RandomArrayGenerator rand(kSeed);

   std::vector<GetBatchSpacedTestCase<int32_t>> int32_cases{
       {1, 100000, 0.01, 1}, {1, 100000, 0.1, 1},    {1, 100000, 0.5, 1},
       {4, 100000, 0.05, 3}, {100, 100000, 0.05, 7},
   };
   for (auto case_ : int32_cases) {
     auto arr = rand.Int32(case_.size, /*min=*/0, case_.max_value, case_.null_probability);
     CheckRoundTripSpaced<Int32Type>(*arr, case_.bit_width);
     CheckRoundTripSpaced<Int32Type>(*arr->Slice(1), case_.bit_width);
   }
 }

 }  // namespace util
 }  // namespace arrow
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	// From Apache Impala (incubating) as of 2016-01-29

	#include <cstdint>
	#include <cstring>
	#include <random>
	#include <vector>

	#include <gtest/gtest.h>

	#include "arrow/array.h"
	#include "arrow/buffer.h"
	#include "arrow/testing/random.h"
	#include "arrow/type.h"
	#include "arrow/util/bit_stream_utils.h"
	#include "arrow/util/bit_util.h"
	#include "arrow/util/io_util.h"
	#include "arrow/util/rle_encoding.h"

	namespace arrow {
	namespace util {

	const int MAX_WIDTH = 32;

	TEST(BitArray, TestBool) {
	const int len = 8;
	uint8_t buffer[len];

	BitUtil::BitWriter writer(buffer, len);

	// Write alternating 0's and 1's
	for (int i = 0; i < 8; ++i) {
	EXPECT_TRUE(writer.PutValue(i % 2, 1));
	}
	writer.Flush();

	EXPECT_EQ(buffer[0], 0xAA /* 0b10101010 */);

	// Write 00110011
	for (int i = 0; i < 8; ++i) {
	bool result = false;
	switch (i) {
	case 0:
	case 1:
	case 4:
	case 5:
	result = writer.PutValue(false, 1);
	break;
	default:
	result = writer.PutValue(true, 1);
	break;
	}
	EXPECT_TRUE(result);
	}
	writer.Flush();

	// Validate the exact bit value
	EXPECT_EQ(buffer[0], 0xAA /* 0b10101010 */);
	EXPECT_EQ(buffer[1], 0xCC /* 0b11001100 */);

	// Use the reader and validate
	BitUtil::BitReader reader(buffer, len);
	for (int i = 0; i < 8; ++i) {
	bool val = false;
	bool result = reader.GetValue(1, &val);
	EXPECT_TRUE(result);
	EXPECT_EQ(val, (i % 2) != 0);
	}

	for (int i = 0; i < 8; ++i) {
	bool val = false;
	bool result = reader.GetValue(1, &val);
	EXPECT_TRUE(result);
	switch (i) {
	case 0:
	case 1:
	case 4:
	case 5:
	EXPECT_EQ(val, false);
	break;
	default:
	EXPECT_EQ(val, true);
	break;
	}
	}
	}

	// Writes 'num_vals' values with width 'bit_width' and reads them back.
	void TestBitArrayValues(int bit_width, int num_vals) {
	int len = static_cast<int>(BitUtil::BytesForBits(bit_width * num_vals));
	EXPECT_GT(len, 0);
	const uint64_t mod = bit_width == 64 ? 1 : 1LL << bit_width;

	std::vector<uint8_t> buffer(len);
	BitUtil::BitWriter writer(buffer.data(), len);
	for (int i = 0; i < num_vals; ++i) {
	bool result = writer.PutValue(i % mod, bit_width);
	EXPECT_TRUE(result);
	}
	writer.Flush();
	EXPECT_EQ(writer.bytes_written(), len);

	BitUtil::BitReader reader(buffer.data(), len);
	for (int i = 0; i < num_vals; ++i) {
	int64_t val = 0;
	bool result = reader.GetValue(bit_width, &val);
	EXPECT_TRUE(result);
	EXPECT_EQ(val, i % mod);
	}
	EXPECT_EQ(reader.bytes_left(), 0);
	}

	TEST(BitArray, TestValues) {
	for (int width = 1; width <= MAX_WIDTH; ++width) {
	TestBitArrayValues(width, 1);
	TestBitArrayValues(width, 2);
	// Don't write too many values
	TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096);
	TestBitArrayValues(width, 1024);
	}
	}

	// Test some mixed values
	TEST(BitArray, TestMixed) {
	const int len = 1024;
	uint8_t buffer[len];
	bool parity = true;

	BitUtil::BitWriter writer(buffer, len);
	for (int i = 0; i < len; ++i) {
	bool result;
	if (i % 2 == 0) {
	result = writer.PutValue(parity, 1);
	parity = !parity;
	} else {
	result = writer.PutValue(i, 10);
	}
	EXPECT_TRUE(result);
	}
	writer.Flush();

	parity = true;
	BitUtil::BitReader reader(buffer, len);
	for (int i = 0; i < len; ++i) {
	bool result;
	if (i % 2 == 0) {
	bool val;
	result = reader.GetValue(1, &val);
	EXPECT_EQ(val, parity);
	parity = !parity;
	} else {
	int val;
	result = reader.GetValue(10, &val);
	EXPECT_EQ(val, i);
	}
	EXPECT_TRUE(result);
	}
	}

	// Validates encoding of values by encoding and decoding them. If
	// expected_encoding != NULL, also validates that the encoded buffer is
	// exactly 'expected_encoding'.
	// if expected_len is not -1, it will validate the encoded size is correct.
	void ValidateRle(const std::vector<int>& values, int bit_width,
	uint8_t* expected_encoding, int expected_len) {
	const int len = 64 * 1024;
	uint8_t buffer[len];
	EXPECT_LE(expected_len, len);

	RleEncoder encoder(buffer, len, bit_width);
	for (size_t i = 0; i < values.size(); ++i) {
	bool result = encoder.Put(values[i]);
	EXPECT_TRUE(result);
	}
	int encoded_len = encoder.Flush();

	if (expected_len != -1) {
	EXPECT_EQ(encoded_len, expected_len);
	}
	if (expected_encoding != NULL) {
	EXPECT_EQ(memcmp(buffer, expected_encoding, encoded_len), 0);
	}

	// Verify read
	{
	RleDecoder decoder(buffer, len, bit_width);
	for (size_t i = 0; i < values.size(); ++i) {
	uint64_t val;
	bool result = decoder.Get(&val);
	EXPECT_TRUE(result);
	EXPECT_EQ(values[i], val);
	}
	}

	// Verify batch read
	{
	RleDecoder decoder(buffer, len, bit_width);
	std::vector<int> values_read(values.size());
	ASSERT_EQ(values.size(),
	decoder.GetBatch(values_read.data(), static_cast<int>(values.size())));
	EXPECT_EQ(values, values_read);
	}
	}

	// A version of ValidateRle that round-trips the values and returns false if
	// the returned values are not all the same
	bool CheckRoundTrip(const std::vector<int>& values, int bit_width) {
	const int len = 64 * 1024;
	uint8_t buffer[len];
	RleEncoder encoder(buffer, len, bit_width);
	for (size_t i = 0; i < values.size(); ++i) {
	bool result = encoder.Put(values[i]);
	if (!result) {
	return false;
	}
	}
	int encoded_len = encoder.Flush();
	int out = 0;

	{
	RleDecoder decoder(buffer, encoded_len, bit_width);
	for (size_t i = 0; i < values.size(); ++i) {
	EXPECT_TRUE(decoder.Get(&out));
	if (values[i] != out) {
	return false;
	}
	}
	}

	// Verify batch read
	{
	RleDecoder decoder(buffer, encoded_len, bit_width);
	std::vector<int> values_read(values.size());
	if (static_cast<int>(values.size()) !=
	decoder.GetBatch(values_read.data(), static_cast<int>(values.size()))) {
	return false;
	}

	if (values != values_read) {
	return false;
	}
	}

	return true;
	}

	TEST(Rle, SpecificSequences) {
	const int len = 1024;
	uint8_t expected_buffer[len];
	std::vector<int> values;

	// Test 50 0' followed by 50 1's
	values.resize(100);
	for (int i = 0; i < 50; ++i) {
	values[i] = 0;
	}
	for (int i = 50; i < 100; ++i) {
	values[i] = 1;
	}

	// expected_buffer valid for bit width <= 1 byte
	expected_buffer[0] = (50 << 1);
	expected_buffer[1] = 0;
	expected_buffer[2] = (50 << 1);
	expected_buffer[3] = 1;
	for (int width = 1; width <= 8; ++width) {
	ValidateRle(values, width, expected_buffer, 4);
	}

	for (int width = 9; width <= MAX_WIDTH; ++width) {
	ValidateRle(values, width, nullptr,
	2 * (1 + static_cast<int>(BitUtil::CeilDiv(width, 8))));
	}

	// Test 100 0's and 1's alternating
	for (int i = 0; i < 100; ++i) {
	values[i] = i % 2;
	}
	int num_groups = static_cast<int>(BitUtil::CeilDiv(100, 8));
	expected_buffer[0] = static_cast<uint8_t>((num_groups << 1) \| 1);
	for (int i = 1; i <= 100 / 8; ++i) {
	expected_buffer[i] = 0xAA /* 0b10101010 */;
	}
	// Values for the last 4 0 and 1's. The upper 4 bits should be padded to 0.
	expected_buffer[100 / 8 + 1] = 0x0A /* 0b00001010 */;

	// num_groups and expected_buffer only valid for bit width = 1
	ValidateRle(values, 1, expected_buffer, 1 + num_groups);
	for (int width = 2; width <= MAX_WIDTH; ++width) {
	int num_values = static_cast<int>(BitUtil::CeilDiv(100, 8)) * 8;
	ValidateRle(values, width, nullptr,
	1 + static_cast<int>(BitUtil::CeilDiv(width * num_values, 8)));
	}

	// Test 16-bit values to confirm encoded values are stored in little endian
	values.resize(28);
	for (int i = 0; i < 16; ++i) {
	values[i] = 0x55aa;
	}
	for (int i = 16; i < 28; ++i) {
	values[i] = 0xaa55;
	}
	expected_buffer[0] = (16 << 1);
	expected_buffer[1] = 0xaa;
	expected_buffer[2] = 0x55;
	expected_buffer[3] = (12 << 1);
	expected_buffer[4] = 0x55;
	expected_buffer[5] = 0xaa;

	ValidateRle(values, 16, expected_buffer, 6);

	// Test 32-bit values to confirm encoded values are stored in little endian
	values.resize(28);
	for (int i = 0; i < 16; ++i) {
	values[i] = 0x555aaaa5;
	}
	for (int i = 16; i < 28; ++i) {
	values[i] = 0x5aaaa555;
	}
	expected_buffer[0] = (16 << 1);
	expected_buffer[1] = 0xa5;
	expected_buffer[2] = 0xaa;
	expected_buffer[3] = 0x5a;
	expected_buffer[4] = 0x55;
	expected_buffer[5] = (12 << 1);
	expected_buffer[6] = 0x55;
	expected_buffer[7] = 0xa5;
	expected_buffer[8] = 0xaa;
	expected_buffer[9] = 0x5a;

	ValidateRle(values, 32, expected_buffer, 10);
	}

	// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value
	// is used, otherwise alternating values are used.
	void TestRleValues(int bit_width, int num_vals, int value = -1) {
	const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width;
	std::vector<int> values;
	for (int v = 0; v < num_vals; ++v) {
	values.push_back((value != -1) ? value : static_cast<int>(v % mod));
	}
	ValidateRle(values, bit_width, NULL, -1);
	}

	TEST(Rle, TestValues) {
	for (int width = 1; width <= MAX_WIDTH; ++width) {
	TestRleValues(width, 1);
	TestRleValues(width, 1024);
	TestRleValues(width, 1024, 0);
	TestRleValues(width, 1024, 1);
	}
	}

	TEST(Rle, BitWidthZeroRepeated) {
	uint8_t buffer[1];
	const int num_values = 15;
	buffer[0] = num_values << 1; // repeated indicator byte
	RleDecoder decoder(buffer, sizeof(buffer), 0);
	uint8_t val;
	for (int i = 0; i < num_values; ++i) {
	bool result = decoder.Get(&val);
	EXPECT_TRUE(result);
	EXPECT_EQ(val, 0); // can only encode 0s with bit width 0
	}
	EXPECT_FALSE(decoder.Get(&val));
	}

	TEST(Rle, BitWidthZeroLiteral) {
	uint8_t buffer[1];
	const int num_groups = 4;
	buffer[0] = num_groups << 1 \| 1; // literal indicator byte
	RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0);
	const int num_values = num_groups * 8;
	uint8_t val;
	for (int i = 0; i < num_values; ++i) {
	bool result = decoder.Get(&val);
	EXPECT_TRUE(result);
	EXPECT_EQ(val, 0); // can only encode 0s with bit width 0
	}
	EXPECT_FALSE(decoder.Get(&val));
	}

	// Test that writes out a repeated group and then a literal
	// group but flush before finishing.
	TEST(BitRle, Flush) {
	std::vector<int> values;
	for (int i = 0; i < 16; ++i) values.push_back(1);
	values.push_back(0);
	ValidateRle(values, 1, NULL, -1);
	values.push_back(1);
	ValidateRle(values, 1, NULL, -1);
	values.push_back(1);
	ValidateRle(values, 1, NULL, -1);
	values.push_back(1);
	ValidateRle(values, 1, NULL, -1);
	}

	// Test some random sequences.
	TEST(BitRle, Random) {
	int niters = 50;
	int ngroups = 1000;
	int max_group_size = 16;
	std::vector<int> values(ngroups + max_group_size);

	// prng setup
	const auto seed = ::arrow::internal::GetRandomSeed();
	std::default_random_engine gen(
	static_cast<std::default_random_engine::result_type>(seed));
	std::uniform_int_distribution<int> dist(1, 20);

	for (int iter = 0; iter < niters; ++iter) {
	// generate a seed with device entropy
	bool parity = 0;
	values.resize(0);

	for (int i = 0; i < ngroups; ++i) {
	int group_size = dist(gen);
	if (group_size > max_group_size) {
	group_size = 1;
	}
	for (int i = 0; i < group_size; ++i) {
	values.push_back(parity);
	}
	parity = !parity;
	}
	if (!CheckRoundTrip(values, BitUtil::NumRequiredBits(values.size()))) {
	FAIL() << "failing seed: " << seed;
	}
	}
	}

	// Test a sequence of 1 0's, 2 1's, 3 0's. etc
	// e.g. 011000111100000
	TEST(BitRle, RepeatedPattern) {
	std::vector<int> values;
	const int min_run = 1;
	const int max_run = 32;

	for (int i = min_run; i <= max_run; ++i) {
	int v = i % 2;
	for (int j = 0; j < i; ++j) {
	values.push_back(v);
	}
	}

	// And go back down again
	for (int i = max_run; i >= min_run; --i) {
	int v = i % 2;
	for (int j = 0; j < i; ++j) {
	values.push_back(v);
	}
	}

	ValidateRle(values, 1, NULL, -1);
	}

	TEST(BitRle, Overflow) {
	for (int bit_width = 1; bit_width < 32; bit_width += 3) {
	int len = RleEncoder::MinBufferSize(bit_width);
	std::vector<uint8_t> buffer(len);
	int num_added = 0;
	bool parity = true;

	RleEncoder encoder(buffer.data(), len, bit_width);
	// Insert alternating true/false until there is no space left
	while (true) {
	bool result = encoder.Put(parity);
	parity = !parity;
	if (!result) break;
	++num_added;
	}

	int bytes_written = encoder.Flush();
	EXPECT_LE(bytes_written, len);
	EXPECT_GT(num_added, 0);

	RleDecoder decoder(buffer.data(), bytes_written, bit_width);
	parity = true;
	uint32_t v;
	for (int i = 0; i < num_added; ++i) {
	bool result = decoder.Get(&v);
	EXPECT_TRUE(result);
	EXPECT_EQ(v != 0, parity);
	parity = !parity;
	}
	// Make sure we get false when reading past end a couple times.
	EXPECT_FALSE(decoder.Get(&v));
	EXPECT_FALSE(decoder.Get(&v));
	}
	}

	template <typename Type>
	void CheckRoundTripSpaced(const Array& data, int bit_width) {
	using ArrayType = typename TypeTraits<Type>::ArrayType;
	using T = typename Type::c_type;

	int num_values = static_cast<int>(data.length());
	int buffer_size = RleEncoder::MaxBufferSize(bit_width, num_values);

	const T* values = static_cast<const ArrayType&>(data).raw_values();

	std::vector<uint8_t> buffer(buffer_size);
	RleEncoder encoder(buffer.data(), buffer_size, bit_width);
	for (int i = 0; i < num_values; ++i) {
	if (data.IsValid(i)) {
	if (!encoder.Put(static_cast<uint64_t>(values[i]))) {
	FAIL() << "Encoding failed";
	}
	}
	}
	int encoded_size = encoder.Flush();

	// Verify batch read
	RleDecoder decoder(buffer.data(), encoded_size, bit_width);
	std::vector<T> values_read(num_values);

	if (num_values != decoder.GetBatchSpaced(
	num_values, static_cast<int>(data.null_count()),
	data.null_bitmap_data(), data.offset(), values_read.data())) {
	FAIL();
	}

	for (int64_t i = 0; i < num_values; ++i) {
	if (data.IsValid(i)) {
	if (values_read[i] != values[i]) {
	FAIL() << "Index " << i << " read " << values_read[i] << " but should be "
	<< values[i];
	}
	}
	}
	}

	template <typename T>
	struct GetBatchSpacedTestCase {
	T max_value;
	int64_t size;
	double null_probability;
	int bit_width;
	};

	TEST(RleDecoder, GetBatchSpaced) {
	uint32_t kSeed = 1337;
	::arrow::random::RandomArrayGenerator rand(kSeed);

	std::vector<GetBatchSpacedTestCase<int32_t>> int32_cases{
	{1, 100000, 0.01, 1}, {1, 100000, 0.1, 1}, {1, 100000, 0.5, 1},
	{4, 100000, 0.05, 3}, {100, 100000, 0.05, 7},
	};
	for (auto case_ : int32_cases) {
	auto arr = rand.Int32(case_.size, /min=/0, case_.max_value, case_.null_probability);
	CheckRoundTripSpaced<Int32Type>(*arr, case_.bit_width);
	CheckRoundTripSpaced<Int32Type>(*arr->Slice(1), case_.bit_width);
	}
	}

	} // namespace util
	} // namespace arrow