| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef ORC_RLEV2_HH |
| #define ORC_RLEV2_HH |
| |
| #include "Adaptor.hh" |
| #include "RLE.hh" |
| #include "orc/Exceptions.hh" |
| |
| #include <vector> |
| |
// Constants shared by the RLE v2 code. They stay preprocessor macros so that
// every translation unit including this header keeps seeing the same names.
// NOTE(review): MAX_LITERAL_SIZE and MIN_REPEAT are not referenced anywhere in
// this header; presumably they are consumed by the encoder implementation --
// confirm before changing either value.
#define MAX_LITERAL_SIZE 512
#define MIN_REPEAT 3
// Number of elements in RleEncoderV2::histgram_.
#define HIST_LEN 32
| namespace orc { |
| |
/**
 * Scope holder for the FBS enumeration.
 *
 * The enumerators are numbered consecutively starting at 0; the trailing
 * SIZE entry therefore equals the number of entries that precede it (32).
 *
 * NOTE(review): judging by the names, each entry stands for one supported
 * fixed bit width (1..24, then 26, 28, 30, 32, 40, 48, 56 and 64) -- confirm
 * against the code that maps these values to actual widths.
 */
struct FixedBitSizes {
  enum FBS {
    ONE = 0,
    TWO = 1,
    THREE = 2,
    FOUR = 3,
    FIVE = 4,
    SIX = 5,
    SEVEN = 6,
    EIGHT = 7,
    NINE = 8,
    TEN = 9,
    ELEVEN = 10,
    TWELVE = 11,
    THIRTEEN = 12,
    FOURTEEN = 13,
    FIFTEEN = 14,
    SIXTEEN = 15,
    SEVENTEEN = 16,
    EIGHTEEN = 17,
    NINETEEN = 18,
    TWENTY = 19,
    TWENTYONE = 20,
    TWENTYTWO = 21,
    TWENTYTHREE = 22,
    TWENTYFOUR = 23,
    // From here on the names no longer advance one step at a time.
    TWENTYSIX = 24,
    TWENTYEIGHT = 25,
    THIRTY = 26,
    THIRTYTWO = 27,
    FORTY = 28,
    FORTYEIGHT = 29,
    FIFTYSIX = 30,
    SIXTYFOUR = 31,
    // Count of the enumerators above; not a bit-size entry itself.
    SIZE = 32
  };
};
| |
/**
 * The four encodings selectable through EncodingOption::encoding.
 * The numeric values are fixed explicitly (0..3).
 */
enum EncodingType {
  SHORT_REPEAT = 0,
  DIRECT = 1,
  PATCHED_BASE = 2,
  DELTA = 3
};
| |
| struct EncodingOption { |
| EncodingType encoding; |
| int64_t fixedDelta; |
| int64_t gapVsPatchListCount; |
| int64_t zigzagLiteralsCount; |
| int64_t baseRedLiteralsCount; |
| int64_t adjDeltasCount; |
| uint32_t zzBits90p; |
| uint32_t zzBits100p; |
| uint32_t brBits95p; |
| uint32_t brBits100p; |
| uint32_t bitsDeltaMax; |
| uint32_t patchWidth; |
| uint32_t patchGapWidth; |
| uint32_t patchLength; |
| int64_t min; |
| bool isFixedDelta; |
| }; |
| |
| class RleEncoderV2 : public RleEncoder { |
| public: |
| RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, |
| bool alignBitPacking = true); |
| |
| ~RleEncoderV2() override { |
| delete[] literals; |
| delete[] gapVsPatchList_; |
| delete[] zigzagLiterals_; |
| delete[] baseRedLiterals_; |
| delete[] adjDeltas_; |
| } |
| /** |
| * Flushing underlying BufferedOutputStream |
| */ |
| uint64_t flush() override; |
| |
| void write(int64_t val) override; |
| |
| private: |
| const bool alignedBitPacking_; |
| uint32_t fixedRunLength_; |
| uint32_t variableRunLength_; |
| int64_t prevDelta_; |
| int32_t histgram_[HIST_LEN]; |
| |
| // The four list below should actually belong to EncodingOption since it only holds temporal |
| // values in write(int64_t val), it is move here for performance consideration. |
| int64_t* gapVsPatchList_; |
| int64_t* zigzagLiterals_; |
| int64_t* baseRedLiterals_; |
| int64_t* adjDeltas_; |
| |
| uint32_t getOpCode(EncodingType encoding); |
| int64_t* prepareForDirectOrPatchedBase(EncodingOption& option); |
| void determineEncoding(EncodingOption& option); |
| void computeZigZagLiterals(EncodingOption& option); |
| void preparePatchedBlob(EncodingOption& option); |
| |
| void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize); |
| void initializeLiterals(int64_t val); |
| void writeValues(EncodingOption& option); |
| void writeShortRepeatValues(EncodingOption& option); |
| void writeDirectValues(EncodingOption& option); |
| void writePatchedBasedValues(EncodingOption& option); |
| void writeDeltaValues(EncodingOption& option); |
| uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, |
| bool reuseHist = false); |
| }; |
| |
| class RleDecoderV2 : public RleDecoder { |
| public: |
| RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned, MemoryPool& pool, |
| ReaderMetrics* metrics); |
| |
| /** |
| * Seek to a particular spot. |
| */ |
| void seek(PositionProvider&) override; |
| |
| /** |
| * Seek over a given number of values. |
| */ |
| void skip(uint64_t numValues) override; |
| |
| /** |
| * Read a number of values into the batch. |
| */ |
| template <typename T> |
| void next(T* data, uint64_t numValues, const char* notNull); |
| |
| void next(int64_t* data, uint64_t numValues, const char* notNull) override; |
| |
| void next(int32_t* data, uint64_t numValues, const char* notNull) override; |
| |
| void next(int16_t* data, uint64_t numValues, const char* notNull) override; |
| |
| unsigned char readByte(); |
| |
| void setBufStart(const char* start) { |
| bufferStart_ = const_cast<char*>(start); |
| } |
| |
| char* getBufStart() { |
| return bufferStart_; |
| } |
| |
| void setBufEnd(const char* end) { |
| bufferEnd_ = const_cast<char*>(end); |
| } |
| |
| char* getBufEnd() { |
| return bufferEnd_; |
| } |
| |
| uint64_t bufLength() { |
| return bufferEnd_ - bufferStart_; |
| } |
| |
| void setBitsLeft(const uint32_t bits) { |
| bitsLeft_ = bits; |
| } |
| |
| void setCurByte(const uint32_t byte) { |
| curByte_ = byte; |
| } |
| |
| uint32_t getBitsLeft() { |
| return bitsLeft_; |
| } |
| |
| uint32_t getCurByte() { |
| return curByte_; |
| } |
| |
| /** |
| * Most hotspot of this function locates in saving stack, so inline this function to have |
| * performance gain. |
| */ |
| inline void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); |
| |
| private: |
| /** |
| * Decode the next gap and patch from 'unpackedPatch' and update the index on it. |
| * Used by PATCHED_BASE. |
| * |
| * @param patchBitSize bit size of the patch value |
| * @param patchMask mask for the patch value |
| * @param resGap result of gap |
| * @param resPatch result of patch |
| * @param patchIdx current index in the 'unpackedPatch' buffer |
| */ |
| void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap, |
| int64_t* resPatch, uint64_t* patchIdx); |
| |
| void resetReadLongs() { |
| bitsLeft_ = 0; |
| curByte_ = 0; |
| } |
| |
| void resetRun() { |
| resetReadLongs(); |
| } |
| |
| int64_t readLongBE(uint64_t bsz); |
| int64_t readVslong(); |
| uint64_t readVulong(); |
| void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); |
| |
| template <typename T> |
| uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull); |
| template <typename T> |
| uint64_t nextDirect(T* data, uint64_t offset, uint64_t numValues, const char* notNull); |
| template <typename T> |
| uint64_t nextPatched(T* data, uint64_t offset, uint64_t numValues, const char* notNull); |
| template <typename T> |
| uint64_t nextDelta(T* data, uint64_t offset, uint64_t numValues, const char* notNull); |
| template <typename T> |
| uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull); |
| |
| const std::unique_ptr<SeekableInputStream> inputStream_; |
| const bool isSigned_; |
| unsigned char firstByte_; |
| char* bufferStart_; |
| char* bufferEnd_; |
| uint64_t runLength_; // Length of the current run |
| uint64_t runRead_; // Number of returned values of the current run |
| uint32_t bitsLeft_; // Used by readLongs when bitSize < 8 |
| uint32_t curByte_; // Used by anything that uses readLongs |
| DataBuffer<int64_t> unpackedPatch_; // Used by PATCHED_BASE |
| DataBuffer<int64_t> literals_; // Values of the current run |
| }; |
| |
| inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { |
| uint64_t remainingLen = bufLength(); |
| int bufferLength = 0; |
| const void* bufferPointer = nullptr; |
| |
| if (backupByteLen != 0) { |
| inputStream_->BackUp(backupByteLen); |
| } |
| |
| if (len >= remainingLen && resetBuf) { |
| if (!inputStream_->Next(&bufferPointer, &bufferLength)) { |
| throw ParseError("bad read in RleDecoderV2::resetBufferStart"); |
| } |
| } |
| |
| if (bufferPointer == nullptr) { |
| bufferStart_ += len; |
| } else { |
| bufferStart_ = const_cast<char*>(static_cast<const char*>(bufferPointer)); |
| bufferEnd_ = bufferStart_ + bufferLength; |
| } |
| } |
| } // namespace orc |
| |
| #endif // ORC_RLEV2_HH |