| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #ifndef DORIS_FRAME_OF_REFERENCE_CODING_H |
| #define DORIS_FRAME_OF_REFERENCE_CODING_H |
| |
| |
| #include <cstdlib> |
| #include <iostream> |
| #include <limits> |
| |
| #include "olap/olap_common.h" |
| #include "olap/uint24.h" |
| |
| #include "util/bit_stream_utils.h" |
| #include "util/bit_stream_utils.inline.h" |
| #include "util/faststring.h" |
| |
| namespace doris { |
| |
| static inline uint8_t bits_less_than_64(const uint64_t v) { |
| return v == 0 ? 0 : 64 - __builtin_clzll(v); |
| } |
| |
| // See https://stackoverflow.com/questions/28423405/counting-the-number-of-leading-zeros-in-a-128-bit-integer |
| static inline uint8_t bits_may_more_than_64(const uint128_t v) { |
| uint64_t hi = v >> 64; |
| uint64_t lo = v; |
| int z[3] = { |
| __builtin_clzll(hi), |
| __builtin_clzll(lo) + 64, |
| 128 |
| }; |
| int idx = !hi + ((!lo) & (!hi)); |
| return 128 - z[idx]; |
| } |
| |
| template <typename T> |
| static inline uint8_t bits(const T v) { |
| if (sizeof(T) <= 8) { |
| return bits_less_than_64(v); |
| } else { |
| return bits_may_more_than_64(v); |
| } |
| } |
| |
| // The implementation for frame-of-reference coding |
| // The detail of frame-of-reference coding, please refer to |
| // https://lemire.me/blog/2012/02/08/effective-compression-using-frame-of-reference-and-delta-coding/ |
| // and https://www.elastic.co/cn/blog/frame-of-reference-and-roaring-bitmaps |
| // |
| // The encoded data format is as follows: |
| // |
| // 1. Body: |
| // BitPackingFrame * FrameCount |
| // 2. Footer: |
| // |
| // (8 bit StorageFormat + 8 bit BitWidth) * FrameCount |
| // 8 bit FrameValueNum |
| // 32 bit ValuesNum |
| // |
| // There are currently three storage formats |
| // (1) if the StorageFormat == 0: When input data order is not ascending and the BitPackingFrame format is: |
| // MinValue, (Value[i] - MinVale) * FrameValueNum |
| // |
| // (2) if the StorageFormat == 1: When input data order is ascending and the BitPackingFrame format is: |
| // MinValue, (Value[i] - Value[i - 1]) * FrameValueNum |
| // |
| // (3) if the StorageFormat == 2: When overflow occurs when using (1) or (2) and save original values: |
| // MinValue, (Value[i]) * FrameValueNum |
| // |
| // len(MinValue) can be 32(uint32_t), 64(uint64_t), 128(uint128_t) |
| // |
| // The OrderFlag is 1 represents ascending order, 0 represents not ascending order |
| // The last frame value num maybe less than 128 |
| template<typename T> |
| class ForEncoder { |
| public: |
| explicit ForEncoder(faststring* buffer): _buffer(buffer) {} |
| |
| void put(const T value) { |
| return put_batch(&value, 1); |
| } |
| |
| void put_batch(const T* value, size_t count); |
| |
| // Flushes any pending values to the underlying buffer. |
| // Returns the total number of bytes written |
| uint32_t flush(); |
| |
| // underlying buffer size + footer meta size. |
| // Note: should call this method before flush. |
| uint32_t len() { |
| return _buffer->size() + _storage_formats.size() + _bit_widths.size() + 5; |
| } |
| |
| // Resets all the state in the encoder. |
| void clear() { |
| _values_num = 0; |
| _buffered_values_num = 0; |
| _buffer->clear(); |
| } |
| |
| private: |
| void bit_pack(const T *input, uint8_t in_num, int bit_width, uint8_t *output); |
| |
| void bit_packing_one_frame_value(const T* input); |
| |
| const T* copy_value(const T* val, size_t count); |
| |
| const T numeric_limits_max(); |
| |
| uint32_t _values_num = 0; |
| uint8_t _buffered_values_num = 0; |
| static const uint8_t FRAME_VALUE_NUM = 128; |
| T _buffered_values[FRAME_VALUE_NUM]; |
| |
| faststring* _buffer; |
| std::vector<uint8_t> _storage_formats; |
| std::vector<uint8_t> _bit_widths; |
| }; |
| |
| template<typename T> |
| class ForDecoder { |
| public: |
| explicit ForDecoder(const uint8_t* in_buffer, size_t buffer_len) |
| :_buffer (in_buffer), |
| _buffer_len (buffer_len), |
| _parsed(false){} |
| |
| // read footer metadata |
| bool init(); |
| |
| bool get(T* val) { |
| return get_batch(val, 1); |
| } |
| |
| // Gets the next batch value. Returns false if there are no more. |
| bool get_batch(T* val, size_t count); |
| |
| // The skip_num is positive means move forwards |
| // The skip_num is negative means move backwards |
| bool skip(int32_t skip_num); |
| |
| bool seek_at_or_after_value(const void* value, bool* exact_match); |
| |
| uint32_t current_index() const { |
| return _current_index; |
| } |
| |
| uint32_t count() const { |
| return _values_num; |
| } |
| |
| private: |
| |
| void bit_unpack(const uint8_t *input, uint8_t in_num, int bit_width, T *output); |
| |
| inline uint32_t frame_size(uint32_t frame_index) { |
| return (frame_index == _frame_count - 1) ? _last_frame_size : _max_frame_size; |
| } |
| |
| void decode_current_frame(T* output); |
| |
| T decode_frame_min_value(uint32_t frame_index); |
| |
| // Return index of the last frame which contains value < target. |
| // Return `_frame_count - 1` when all frames are < target. |
| // Return `_frame_count` when not found (all frames are >= target). |
| uint32_t seek_last_frame_before_value(T target); |
| |
| // Seek to the first value in frame that >= target. |
| // Return true when found and update exact_match. |
| // Return false otherwise. |
| bool seek_lower_bound_inside_frame(uint32_t frame_index, T target, bool* exact_match); |
| |
| T* copy_value(T* val, size_t count); |
| |
| const uint8_t* _buffer = nullptr; |
| size_t _buffer_len = 0; |
| bool _parsed = false; |
| |
| uint8_t _max_frame_size = 0; |
| uint8_t _last_frame_size = 0; |
| uint32_t _values_num = 0; |
| uint32_t _frame_count = 0; |
| std::vector<uint32_t> _frame_offsets; |
| std::vector<uint8_t> _bit_widths; |
| std::vector<uint8_t> _storage_formats; |
| |
| uint32_t _current_index = 0; |
| uint32_t _current_decoded_frame = -1; |
| std::vector<T> _out_buffer; // store values of decoded frame |
| }; |
| } |
| |
| |
| #endif //DORIS_FRAME_OF_REFERENCE_CODING_H |