blob: 31b9998e2f8f73916d27145633c890e4365dad12 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "exec/parquet/parquet-bool-decoder.h"
#include "util/mem-util.h"
#include "common/names.h"
namespace impala {
// Points the decoder at a new data page of 'size' bytes starting at 'data' and encoded
// with 'encoding'. Only the decoder that matches 'encoding' is reset. On success the
// cache of already unpacked values is emptied and true is returned.
// Returns false if 'encoding' is neither PLAIN nor RLE, or if an RLE page is too short
// to contain its length prefix. In both failure cases 'encoding_' has already been
// overwritten, but the decoders and the unpacked value cache are left as they were.
bool ParquetBoolDecoder::SetData(
    parquet::Encoding::type encoding, uint8_t* data, int size) {
  // Number of bytes taken by the length prefix in front of RLE encoded data.
  constexpr int RLE_LEN_PREFIX_BYTES = 4;

  encoding_ = encoding;
  // Only the relevant decoder is initialized for a given data page.
  switch (encoding) {
    case parquet::Encoding::PLAIN:
      bool_values_.Reset(data, size);
      break;
    case parquet::Encoding::RLE:
      // The first 4 bytes contain the size of the encoded data. This information is
      // redundant, as this is the last part of the data page, and the number of
      // remaining bytes is already known.
      // The page comes from a file and may be truncated or corrupt: without this check
      // a page shorter than the prefix would yield a negative length and a start
      // pointer past the end of the page.
      if (UNLIKELY(size < RLE_LEN_PREFIX_BYTES)) return false;
      rle_decoder_.Reset(data + RLE_LEN_PREFIX_BYTES, size - RLE_LEN_PREFIX_BYTES, 1);
      break;
    default:
      return false;
  }
  // Values cached from the previous page must not leak into the new one.
  num_unpacked_values_ = 0;
  unpacked_value_idx_ = 0;
  return true;
}
// Decodes 'count' values, writing them 'stride' bytes apart starting at 'first_value'.
// The encoding recorded by SetData() is examined once here so that the per-value loop
// runs in a version specialized on the encoding. Returns the result of that
// specialized loop.
bool ParquetBoolDecoder::DecodeValues(
    int64_t stride, int64_t count, bool* RESTRICT first_value) RESTRICT {
  if (encoding_ != parquet::Encoding::PLAIN) {
    // SetData() accepts only PLAIN and RLE, so anything that is not PLAIN must be RLE.
    DCHECK_EQ(encoding_, parquet::Encoding::RLE);
    return DecodeValues<parquet::Encoding::RLE>(stride, count, first_value);
  }
  return DecodeValues<parquet::Encoding::PLAIN>(stride, count, first_value);
}
// Encoding-specialized decode loop: decodes 'count' values one at a time into the
// slots handed out by a StrideWriter constructed from 'first_value' and 'stride'.
// Stops and returns false as soon as a single value fails to decode; returns true once
// all 'count' values were written (including when 'count' is not positive).
template <parquet::Encoding::type ENCODING>
bool ParquetBoolDecoder::DecodeValues(
    int64_t stride, int64_t count, bool* RESTRICT first_value) RESTRICT {
  // TODO: we could optimise this further if needed by bypassing 'unpacked_values_'.
  StrideWriter<bool> writer(first_value, stride);
  int64_t values_left = count;
  while (values_left > 0) {
    if (UNLIKELY(!DecodeValue<ENCODING>(writer.Advance()))) return false;
    --values_left;
  }
  return true;
}
// Skips the next 'num_values' values of the current page; 'num_values' must be
// positive. Values that are already unpacked in 'unpacked_values_' are consumed first,
// the rest is skipped in the decoder selected by 'encoding_'.
// Returns true if all 'num_values' values were skipped, false if the underlying
// decoder could not skip or provide that many values.
bool ParquetBoolDecoder::SkipValues(int num_values) {
  DCHECK_GT(num_values, 0);
  // Use up whatever is left in the cache of unpacked values before touching the
  // underlying decoder.
  int skip_cached = min(num_unpacked_values_ - unpacked_value_idx_, num_values);
  unpacked_value_idx_ += skip_cached;
  if (skip_cached == num_values) return true;
  int num_remaining = num_values - skip_cached;
  if (encoding_ == parquet::Encoding::PLAIN) {
    // The bulk of the values (the count rounded down with a factor of 32) is skipped
    // directly in the bit reader without unpacking.
    int num_to_skip = BitUtil::RoundDownToPowerOf2(num_remaining, 32);
    // A failed skip means the page holds fewer values than requested. Report it
    // instead of carrying on from the wrong position.
    if (num_to_skip > 0 && UNLIKELY(!bool_values_.SkipBatch(1, num_to_skip))) {
      return false;
    }
    num_remaining -= num_to_skip;
    if (num_remaining > 0) {
      // The leftover is skipped by refilling the cache and starting to read from it
      // at offset 'num_remaining'.
      DCHECK_LE(num_remaining, UNPACKED_BUFFER_LEN);
      num_unpacked_values_ = bool_values_.UnpackBatch(1, UNPACKED_BUFFER_LEN,
          &unpacked_values_[0]);
      if (UNLIKELY(num_unpacked_values_ < num_remaining)) return false;
      unpacked_value_idx_ = num_remaining;
    }
    return true;
  } else {
    // rle_decoder_.SkipValues() might fill its internal buffer 'literal_buffer_'.
    // This can result in sub-optimal decoding later, because 'literal_buffer_' might
    // be used again and again, especially when reading a very long literal run.
    DCHECK_EQ(encoding_, parquet::Encoding::RLE);
    return rle_decoder_.SkipValues(num_remaining) == num_remaining;
  }
}
} // namespace impala