blob: f54e072ef13befe5eb244b3ece70178e08cabe47 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstdlib>
#include "MemoryOutputStream.hh"
#include "RLEv1.hh"
#include "wrap/orc-proto-wrapper.hh"
#include "wrap/gtest-wrapper.h"
#ifdef __clang__
DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations")
#endif
namespace orc {
// gtest names used unqualified by the fixture and the instantiation below.
using ::testing::TestWithParam;
using ::testing::Values;
// Size argument (1024 * 1024) passed to every MemoryOutputStream constructed
// in this file.
const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M
/**
 * Value-parameterized fixture shared by all RLE encoder tests in this file.
 *
 * The bool test parameter is copied into alignBitpacking by SetUp() and is
 * forwarded to createRleEncoder() by getEncoder().
 */
class RleTest : public TestWithParam<bool> {
  // gtest set-up hook; `override` makes the compiler verify that it really
  // matches the base-class virtual.
  void SetUp() override;

 protected:
  // Current test parameter. Assigned in SetUp(); the initializer only
  // guarantees a defined value before that happens.
  bool alignBitpacking = false;

  // Builds an RLE encoder of the given version that writes into memStream.
  std::unique_ptr<RleEncoder> getEncoder(RleVersion version,
                                         MemoryOutputStream& memStream,
                                         bool isSigned);

  // Encodes inputData with an unsigned RLE v2 encoder and compares the
  // produced bytes against expectedOutput.
  void runExampleTest(int64_t* inputData, uint64_t inputLength,
                      unsigned char* expectedOutput, uint64_t outputLength);

  // Generates numValues values (optionally with numNulls nulls), encodes
  // them, decodes the result and checks that the round trip is lossless.
  void runTest(RleVersion version,
               uint64_t numValues,
               int64_t start,
               int64_t delta,
               bool random,
               bool isSigned,
               uint64_t numNulls = 0);
};
void RleTest::SetUp() {
alignBitpacking = GetParam();
}
/**
 * Fills `data` with test values and, optionally, builds a null mask.
 *
 * @param numValues number of slots in `data` (and in `notNull`, if given)
 * @param start     first value of the arithmetic sequence (used when !random)
 * @param delta     step of the arithmetic sequence (used when !random)
 * @param random    when true, every non-null slot receives std::rand()
 * @param data      output buffer with room for `numValues` elements
 * @param numNulls  number of slots to mark as null; values larger than
 *                  `numValues` are clamped to `numValues`
 * @param notNull   optional mask of `numValues` bytes. When it is non-null and
 *                  `numNulls` != 0 it is rewritten so that exactly `numNulls`
 *                  entries are 0 and all others are 1. When `numNulls` == 0 the
 *                  mask is read as supplied by the caller.
 *
 * Slots whose mask entry is 0 are left untouched in `data`.
 */
void generateData(
    uint64_t numValues,
    int64_t start,
    int64_t delta,
    bool random,
    int64_t* data,
    uint64_t numNulls = 0,
    char* notNull = nullptr) {
  // More nulls than slots can never be placed: without this clamp the
  // rejection loop below would never terminate, and numValues == 0 would
  // make the modulo divide by zero.
  if (numNulls > numValues) {
    numNulls = numValues;
  }

  if (numNulls != 0 && notNull != nullptr) {
    if (numNulls == numValues) {
      // Every slot is null; no random positions need to be drawn.
      memset(notNull, 0, numValues);
    } else {
      memset(notNull, 1, numValues);
      // Rejection sampling: draw positions until numNulls distinct slots
      // have been cleared.
      while (numNulls > 0) {
        uint64_t pos = static_cast<uint64_t>(std::rand()) % numValues;
        if (notNull[pos]) {
          notNull[pos] = static_cast<char>(0);
          --numNulls;
        }
      }
    }
  }

  for (uint64_t i = 0; i < numValues; ++i) {
    if (notNull == nullptr || notNull[i]) {
      if (!random) {
        data[i] = start + delta * static_cast<int64_t>(i);
      } else {
        data[i] = std::rand();
      }
    }
  }
}
/**
 * Decodes the bytes held by `memStream` and checks them against `data`.
 *
 * @param version   RLE version handed to createRleDecoder()
 * @param memStream stream whose getData()/getLength() supply the encoded bytes
 * @param data      expected values, `numValues` entries
 * @param numValues number of values to decode
 * @param notNull   optional null mask; entries that are 0 are not compared.
 *                  A null pointer means every entry is compared.
 * @param isSigned  signedness flag handed to createRleDecoder()
 *
 * Mismatches are reported with EXPECT_EQ, so all positions are checked even
 * after a failure.
 */
void decodeAndVerify(
    RleVersion version,
    const MemoryOutputStream& memStream,
    int64_t * data,
    uint64_t numValues,
    const char* notNull,
    bool isSigned) {
  std::unique_ptr<RleDecoder> decoder = createRleDecoder(
      std::unique_ptr<SeekableArrayInputStream>(new SeekableArrayInputStream(
          memStream.getData(),
          memStream.getLength())),
      isSigned, version, *getDefaultPool());

  // Owned by a unique_ptr so the buffer is released even if next() throws.
  std::unique_ptr<int64_t[]> decodedData(new int64_t[numValues]);
  decoder->next(decodedData.get(), numValues, notNull);

  for (uint64_t i = 0; i < numValues; ++i) {
    if (!notNull || notNull[i]) {
      EXPECT_EQ(data[i], decodedData[i]);
    }
  }
}
// Creates an RLE encoder of the requested version whose output is buffered
// through a BufferedOutputStream and ends up in memStream. The fixture's
// alignBitpacking flag is forwarded to createRleEncoder().
// NOTE(review): 500 * 1024 and 1024 are presumably the buffer capacity and
// block size - confirm against the BufferedOutputStream constructor.
std::unique_ptr<RleEncoder> RleTest::getEncoder(RleVersion version,
                                                MemoryOutputStream& memStream,
                                                bool isSigned) {
  MemoryPool& defaultPool = *getDefaultPool();
  return createRleEncoder(
      std::unique_ptr<BufferedOutputStream>(new BufferedOutputStream(
          defaultPool, &memStream, 500 * 1024, 1024)),
      isSigned,
      version,
      defaultPool,
      alignBitpacking);
}
void RleTest::runExampleTest(int64_t* inputData,
uint64_t inputLength,
unsigned char* expectedOutput,
uint64_t outputLength) {
MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
std::unique_ptr<RleEncoder> encoder = getEncoder(RleVersion_2, memStream, false);
encoder->add(inputData, inputLength, nullptr);
encoder->flush();
const char* output = memStream.getData();
for(int i = 0; i < outputLength; i++) {
EXPECT_EQ(expectedOutput[i], static_cast<unsigned char>(output[i]));
}
}
/**
 * Round-trip test: generates data, encodes it, decodes it again and verifies
 * that every non-null value survived unchanged.
 *
 * @param version   RLE version used for both encoder and decoder
 * @param numValues number of values to generate
 * @param start     first value of the generated sequence (when !random)
 * @param delta     step of the generated sequence (when !random)
 * @param random    when true, values come from std::rand()
 * @param isSigned  signedness flag for encoder and decoder
 * @param numNulls  number of null slots; 0 means no null mask is used at all
 */
void RleTest::runTest(RleVersion version,
                      uint64_t numValues,
                      int64_t start,
                      int64_t delta,
                      bool random,
                      bool isSigned,
                      uint64_t numNulls) {
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<RleEncoder> encoder = getEncoder(version, memStream, isSigned);

  // Both buffers are owned by unique_ptr so they are released even when the
  // encoder or decoder throws.
  std::unique_ptr<char[]> notNullBuffer(
      numNulls == 0 ? nullptr : new char[numValues]);
  // Zero-initialised so that slots skipped as null never hold indeterminate
  // values when the buffer is handed to the encoder.
  std::unique_ptr<int64_t[]> data(new int64_t[numValues]());
  char* notNull = notNullBuffer.get();

  generateData(numValues, start, delta, random, data.get(), numNulls, notNull);
  encoder->add(data.get(), numValues, notNull);
  encoder->flush();

  decodeAndVerify(version, memStream, data.get(), numValues, notNull, isSigned);
}
// RLE v1 round trip of the unsigned sequence 0, 1, 2, ... (1024 values).
TEST_P(RleTest, RleV1_delta_increasing_sequance_unsigned) {
  const uint64_t numValues = 1024;
  const int64_t start = 0;
  const int64_t delta = 1;
  const bool random = false;
  const bool isSigned = false;
  runTest(RleVersion_1, numValues, start, delta, random, isSigned);
}
// Same increasing unsigned sequence as above, with 100 of the 1024 slots
// marked null.
TEST_P(RleTest, RleV1_delta_increasing_sequance_unsigned_null) {
  const uint64_t numValues = 1024;
  const int64_t start = 0;
  const int64_t delta = 1;
  const bool random = false;
  const bool isSigned = false;
  const uint64_t numNulls = 100;
  runTest(RleVersion_1, numValues, start, delta, random, isSigned, numNulls);
}
// RLE v1 round trip of 5000, 4997, ... with step -3; the 1024 values stay
// positive (the last one is 5000 - 3 * 1023 = 1931).
TEST_P(RleTest, RleV1_delta_decreasing_sequance_unsigned) {
  const uint64_t numValues = 1024;
  const int64_t start = 5000;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = false;
  runTest(RleVersion_1, numValues, start, delta, random, isSigned);
}
// RLE v1 round trip of the signed sequence 100, 97, ... with step -3, which
// crosses into negative values.
TEST_P(RleTest, RleV1_delta_decreasing_sequance_signed) {
  const uint64_t numValues = 1024;
  const int64_t start = 100;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = true;
  runTest(RleVersion_1, numValues, start, delta, random, isSigned);
}
// Signed decreasing sequence (100, step -3) with 500 of the 1024 slots
// marked null.
TEST_P(RleTest, RleV1_delta_decreasing_sequance_signed_null) {
  const uint64_t numValues = 1024;
  const int64_t start = 100;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = true;
  const uint64_t numNulls = 500;
  runTest(RleVersion_1, numValues, start, delta, random, isSigned, numNulls);
}
// RLE v1 round trip of 1024 std::rand() values, signed. start and delta are
// unused by generateData() when random is true.
// (The misspelled test name is kept so existing test filters keep matching.)
TEST_P(RleTest, rRleV1_andom_sequance_signed) {
  const uint64_t numValues = 1024;
  const int64_t unusedStart = 0;
  const int64_t unusedDelta = 0;
  const bool random = true;
  const bool isSigned = true;
  runTest(RleVersion_1, numValues, unusedStart, unusedDelta, random, isSigned);
}
// RLE v1 round trip in which the null count equals the value count, so every
// slot is null.
TEST_P(RleTest, RleV1_all_null) {
  const uint64_t numValues = 1024;
  const int64_t start = 100;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = true;
  const uint64_t numNulls = 1024;
  runTest(RleVersion_1, numValues, start, delta, random, isSigned, numNulls);
}
// RLE v2 round trip of the unsigned sequence 0, 1, 2, ... (1024 values).
TEST_P(RleTest, RleV2_delta_increasing_sequance_unsigned) {
  const uint64_t numValues = 1024;
  const int64_t start = 0;
  const int64_t delta = 1;
  const bool random = false;
  const bool isSigned = false;
  runTest(RleVersion_2, numValues, start, delta, random, isSigned);
}
// RLE v2: increasing unsigned sequence with 100 of the 1024 slots marked null.
TEST_P(RleTest, RleV2_delta_increasing_sequance_unsigned_null) {
  const uint64_t numValues = 1024;
  const int64_t start = 0;
  const int64_t delta = 1;
  const bool random = false;
  const bool isSigned = false;
  const uint64_t numNulls = 100;
  runTest(RleVersion_2, numValues, start, delta, random, isSigned, numNulls);
}
// RLE v2 round trip of 5000, 4997, ... with step -3; the 1024 values stay
// positive (the last one is 1931).
TEST_P(RleTest, RleV2_delta_decreasing_sequance_unsigned) {
  const uint64_t numValues = 1024;
  const int64_t start = 5000;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = false;
  runTest(RleVersion_2, numValues, start, delta, random, isSigned);
}
// RLE v2 round trip of the signed sequence 100, 97, ... with step -3, which
// crosses into negative values.
TEST_P(RleTest, RleV2_delta_decreasing_sequance_signed) {
  const uint64_t numValues = 1024;
  const int64_t start = 100;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = true;
  runTest(RleVersion_2, numValues, start, delta, random, isSigned);
}
// RLE v2: signed decreasing sequence (100, step -3) with 500 of the 1024
// slots marked null.
TEST_P(RleTest, RleV2_delta_decreasing_sequance_signed_null) {
  const uint64_t numValues = 1024;
  const int64_t start = 100;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = true;
  const uint64_t numNulls = 500;
  runTest(RleVersion_2, numValues, start, delta, random, isSigned, numNulls);
}
// RLE v2 round trip of 1024 std::rand() values, signed. start and delta are
// unused by generateData() when random is true.
TEST_P(RleTest, RleV2_random_sequance_signed) {
  const uint64_t numValues = 1024;
  const int64_t unusedStart = 0;
  const int64_t unusedDelta = 0;
  const bool random = true;
  const bool isSigned = true;
  runTest(RleVersion_2, numValues, unusedStart, unusedDelta, random, isSigned);
}
// RLE v2 round trip in which the null count equals the value count, so every
// slot is null.
TEST_P(RleTest, RleV2_all_null) {
  const uint64_t numValues = 1024;
  const int64_t start = 100;
  const int64_t delta = -3;
  const bool random = false;
  const bool isSigned = true;
  const uint64_t numNulls = 1024;
  runTest(RleVersion_2, numValues, start, delta, random, isSigned, numNulls);
}
// RLE v2 round trip of the constant value 123 repeated 1024 times, unsigned.
TEST_P(RleTest, RleV2_delta_zero_unsigned) {
  const uint64_t numValues = 1024;
  const int64_t value = 123;
  const int64_t delta = 0;
  const bool random = false;
  const bool isSigned = false;
  runTest(RleVersion_2, numValues, value, delta, random, isSigned);
}
// RLE v2 round trip of the constant value 123 repeated 1024 times, signed.
TEST_P(RleTest, RleV2_delta_zero_signed) {
  const uint64_t numValues = 1024;
  const int64_t value = 123;
  const int64_t delta = 0;
  const bool random = false;
  const bool isSigned = true;
  runTest(RleVersion_2, numValues, value, delta, random, isSigned);
}
// RLE v2 round trip of a short run: the value 123 repeated only 8 times.
TEST_P(RleTest, RleV2_short_repeat) {
  const uint64_t numValues = 8;
  const int64_t value = 123;
  const int64_t delta = 0;
  const bool random = false;
  const bool isSigned = false;
  runTest(RleVersion_2, numValues, value, delta, random, isSigned);
}
// Five repetitions of 10000 must encode to the three bytes below.
// NOTE(review): values appear to be the short-repeat example from the ORC
// specification - confirm.
TEST_P(RleTest, RleV2_short_repeat_example) {
  int64_t input[] = {10000, 10000, 10000, 10000, 10000};
  unsigned char expected[] = {0x0a, 0x27, 0x10};
  // Lengths are derived from the arrays: 5 values in, 3 bytes out.
  runExampleTest(input, sizeof(input) / sizeof(input[0]),
                 expected, sizeof(expected));
}
// Four unrelated values must encode to the ten bytes below.
// NOTE(review): values appear to be the direct-encoding example from the ORC
// specification - confirm.
TEST_P(RleTest, RleV2_direct_example) {
  int64_t input[] = {23713, 43806, 57005, 48879};
  unsigned char expected[] = {0x5e, 0x03, 0x5c, 0xa1, 0xab,
                              0x1e, 0xde, 0xad, 0xbe, 0xef};
  // Lengths are derived from the arrays: 4 values in, 10 bytes out.
  runExampleTest(input, sizeof(input) / sizeof(input[0]),
                 expected, sizeof(expected));
}
// Twenty values around 2000-2190 with a single outlier (1000000) must encode
// to the 28 bytes below.
// NOTE(review): values appear to be the patched-base example from the ORC
// specification - confirm.
TEST_P(RleTest, RleV2_Patched_base_example) {
  int64_t input[] = {
      2030, 2000, 2020, 1000000, 2040, 2050, 2060, 2070, 2080, 2090,
      2100, 2110, 2120, 2130,    2140, 2150, 2160, 2170, 2180, 2190};
  unsigned char expected[] = {
      0x8e, 0x13, 0x2b, 0x21, 0x07, 0xd0, 0x1e, 0x00, 0x14, 0x70,
      0x28, 0x32, 0x3c, 0x46, 0x50, 0x5a, 0x64, 0x6e, 0x78, 0x82,
      0x8c, 0x96, 0xa0, 0xaa, 0xb4, 0xbe, 0xfc, 0xe8};
  // Lengths are derived from the arrays: 20 values in, 28 bytes out.
  runExampleTest(input, sizeof(input) / sizeof(input[0]),
                 expected, sizeof(expected));
}
TEST_P(RleTest, RleV2_Patched_base_big_gap) {
  // 512 input values: indices 0..510 alternate 2, 1, 2, 1, ... and the final
  // value (index 511) is 1024. That makes the last value the only patched
  // value, with a patch gap of 511.
  const int numValues = 512;
  int64_t data[512];
  for (int idx = 0; idx < numValues; ++idx) {
    data[idx] = (idx % 2 == 0) ? 2 : 1;
  }
  data[numValues - 1] = 1024;

  // Encode with a signed RLE v2 encoder.
  const bool isSigned = true;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<RleEncoder> encoder =
      getEncoder(RleVersion_2, memStream, isSigned);
  encoder->add(data, numValues, nullptr);
  encoder->flush();

  // Decode again and compare every value with the input.
  decodeAndVerify(RleVersion_2, memStream, data, numValues, nullptr, isSigned);
}
// The first ten primes must encode to one of two byte sequences, depending
// on whether the encoder was created with alignBitpacking set.
TEST_P(RleTest, RleV2_delta_example) {
  int64_t primes[10] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29};
  unsigned char alignedExpected[8] = {0xc6, 0x09, 0x02, 0x02,
                                      0x22, 0x42, 0x42, 0x46};
  unsigned char unalignedExpected[7] = {0xc4, 0x09, 0x02, 0x02,
                                        0x4A, 0x28, 0xA6};

  if (!alignBitpacking) {
    runExampleTest(primes, 10, unalignedExpected, 7);
    return;
  }
  runExampleTest(primes, 10, alignedExpected, 8);
}
// A large first step (0 -> 10000) followed by steps of 0 or 1 must encode to
// the eight bytes below.
TEST_P(RleTest, RleV2_delta_example2) {
  int64_t input[] = {0, 10000, 10001, 10001, 10002, 10003, 10003};
  unsigned char expected[] = {0xc2, 0x06, 0x0, 0xa0, 0x9c, 0x01, 0x45, 0x0};
  // Lengths are derived from the arrays: 7 values in, 8 bytes out.
  runExampleTest(input, sizeof(input) / sizeof(input[0]),
                 expected, sizeof(expected));
}
// The four values of RleV2_direct_example followed by the five values of
// RleV2_short_repeat_example; the expected bytes are the concatenation of
// the two expected encodings from those tests.
TEST_P(RleTest, RleV2_direct_repeat_example2) {
  int64_t input[] = {23713, 43806, 57005, 48879,
                     10000, 10000, 10000, 10000, 10000};
  unsigned char expected[] = {0x5e, 0x03, 0x5c, 0xa1, 0xab, 0x1e, 0xde,
                              0xad, 0xbe, 0xef, 0x0a, 0x27, 0x10};
  // Lengths are derived from the arrays: 9 values in, 13 bytes out.
  runExampleTest(input, sizeof(input) / sizeof(input[0]),
                 expected, sizeof(expected));
}
// Instantiates every RleTest TEST_P above under the "OrcTest" prefix with the
// parameter values true and false; SetUp() stores the value in
// alignBitpacking.
INSTANTIATE_TEST_CASE_P(OrcTest, RleTest, Values(true, false));
}