blob: 89f908da9dcbd02b227c1440d0b05a8cbc48fb77 [file]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "paimon/common/utils/delta_varint_compressor.h"
#include <cstdint>
#include <utility>
#include <vector>
#include "paimon/result.h"
#include "paimon/status.h"
namespace paimon {
std::vector<char> DeltaVarintCompressor::Compress(const std::vector<int64_t>& data) {
if (data.empty()) {
return {};
}
// 1. Delta encoding (use unsigned subtraction to avoid signed overflow UB)
std::vector<int64_t> deltas;
deltas.reserve(data.size());
deltas.push_back(data[0]);
for (size_t i = 1; i < data.size(); i++) {
uint64_t unsigned_delta =
static_cast<uint64_t>(data[i]) - static_cast<uint64_t>(data[i - 1]);
deltas.push_back(static_cast<int64_t>(unsigned_delta));
}
// 2. ZigZag + Varint
std::vector<char> out;
out.reserve(data.size() * 10);
for (const auto& delta : deltas) {
EncodeVarint(delta, &out);
}
return out;
}
Result<std::vector<int64_t>> DeltaVarintCompressor::Decompress(const std::vector<char>& bytes) {
if (bytes.empty()) {
return std::vector<int64_t>();
}
// 1. Decode ZigZag + Varint → delta
std::vector<int64_t> deltas;
deltas.reserve(bytes.size());
size_t pos = 0;
while (pos < bytes.size()) {
PAIMON_ASSIGN_OR_RAISE(int64_t delta, DecodeVarint(bytes, &pos));
deltas.push_back(delta);
}
// 2. Delta decoding (use unsigned addition to avoid signed overflow UB)
std::vector<int64_t> result(deltas.size());
result[0] = deltas[0];
for (size_t i = 1; i < result.size(); i++) {
uint64_t reconstructed =
static_cast<uint64_t>(result[i - 1]) + static_cast<uint64_t>(deltas[i]);
result[i] = static_cast<int64_t>(reconstructed);
}
return result;
}
void DeltaVarintCompressor::EncodeVarint(int64_t value, std::vector<char>* out) {
uint64_t tmp = ZigZag(value);
// Check if multiple bytes are needed
while (tmp & ~0x7F) {
// Set MSB to 1 (continuation)
out->push_back(static_cast<char>((tmp & 0x7F) | 0x80));
// Unsigned right shift
tmp >>= 7;
}
// Final byte with MSB set to 0
out->push_back(static_cast<char>(tmp));
}
Result<int64_t> DeltaVarintCompressor::DecodeVarint(const std::vector<char>& in, size_t* pos) {
uint64_t result = 0;
int32_t shift = 0;
while (true) {
if (*pos >= in.size()) {
return Status::Invalid("Unexpected end of input");
}
char byte = in[(*pos)++];
// Extract 7 bits
result |= (static_cast<uint64_t>(byte & 0x7F) << shift);
// MSB is 0, end of encoding
if ((byte & 0x80) == 0) {
break;
}
shift += 7;
if (shift > 63) {
return Status::Invalid("Varint overflow");
}
}
return UnZigZag(result);
}
} // namespace paimon