// Source blob: a8e0340e7e78c7cbc8fac877a6545ac7ff1fa5d2
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ORC_RLEV2_HH
#define ORC_RLEV2_HH
#include "Adaptor.hh"
#include "RLE.hh"
#include "orc/Exceptions.hh"
#include <vector>
// Constants for the RLE v2 encoder/decoder declared in this header.
// NOTE(review): presumably the maximum number of literals buffered per run; it is not
// referenced in this header -- confirm against the encoder implementation.
#define MAX_LITERAL_SIZE 512
// NOTE(review): presumably the minimum number of equal values treated as a repeat; it is not
// referenced in this header -- confirm against the encoder implementation.
#define MIN_REPEAT 3
// Number of entries in RleEncoderV2::histgram_.
#define HIST_LEN 32
namespace orc {
/**
 * Ordinals for a fixed list of bit widths.
 *
 * Each enumerator name spells out a width in bits (ONE through SIXTYFOUR). The
 * enumerator's value is its zero-based position in the list, not the width
 * itself; above TWENTYFOUR the named widths are no longer consecutive. SIZE is
 * the number of widths in the list.
 */
struct FixedBitSizes {
  enum FBS {
    ONE = 0,
    TWO = 1,
    THREE = 2,
    FOUR = 3,
    FIVE = 4,
    SIX = 5,
    SEVEN = 6,
    EIGHT = 7,
    NINE = 8,
    TEN = 9,
    ELEVEN = 10,
    TWELVE = 11,
    THIRTEEN = 12,
    FOURTEEN = 13,
    FIFTEEN = 14,
    SIXTEEN = 15,
    SEVENTEEN = 16,
    EIGHTEEN = 17,
    NINETEEN = 18,
    TWENTY = 19,
    TWENTYONE = 20,
    TWENTYTWO = 21,
    TWENTYTHREE = 22,
    TWENTYFOUR = 23,
    // From here on the named widths skip values.
    TWENTYSIX = 24,
    TWENTYEIGHT = 25,
    THIRTY = 26,
    THIRTYTWO = 27,
    FORTY = 28,
    FORTYEIGHT = 29,
    FIFTYSIX = 30,
    SIXTYFOUR = 31,
    // Count of the enumerators above.
    SIZE = 32
  };
};
/**
 * Identifies which of the four run encodings applies.
 * The numeric values 0 through 3 are assigned explicitly.
 */
enum EncodingType {
  SHORT_REPEAT = 0,
  DIRECT = 1,
  PATCHED_BASE = 2,
  DELTA = 3
};
// Per-run state that RleEncoderV2's private helpers (determineEncoding, writeValues, the
// write*Values functions, ...) receive by reference.
// The struct has no default member initializers, so a default-initialized instance holds
// indeterminate values; callers must value-initialize or assign every field they read.
struct EncodingOption {
// Encoding selected for the run.
EncodingType encoding;
// NOTE(review): presumably the constant delta of a fixed-delta run -- confirm in the encoder.
int64_t fixedDelta;
// NOTE(review): the four *Count fields presumably hold the number of valid entries in the
// encoder's same-named scratch arrays (gapVsPatchList_, zigzagLiterals_, baseRedLiterals_,
// adjDeltas_) -- confirm in the encoder.
int64_t gapVsPatchListCount;
int64_t zigzagLiteralsCount;
int64_t baseRedLiteralsCount;
int64_t adjDeltasCount;
// NOTE(review): the *Bits* fields look like bit widths at the named percentile (90/95/100) of
// the zigzag ("zz") and base-reduced ("br") literals -- confirm against percentileBits() use.
uint32_t zzBits90p;
uint32_t zzBits100p;
uint32_t brBits95p;
uint32_t brBits100p;
// NOTE(review): presumably the bit width needed for the largest delta -- confirm.
uint32_t bitsDeltaMax;
// NOTE(review): patch-related widths/length, presumably used only for PATCHED_BASE -- confirm.
uint32_t patchWidth;
uint32_t patchGapWidth;
uint32_t patchLength;
// NOTE(review): presumably the minimum literal of the run -- confirm.
int64_t min;
// NOTE(review): presumably true when every adjacent delta in the run is equal -- confirm.
bool isFixedDelta;
};
// RLE version 2 encoder. Derives from RleEncoder and overrides flush() and write(int64_t);
// all member functions except the destructor are defined outside this header.
class RleEncoderV2 : public RleEncoder {
public:
// Takes ownership of outStream. alignBitPacking defaults to true.
// NOTE(review): hasSigned presumably selects signed (zigzag) handling of values -- confirm.
RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned,
bool alignBitPacking = true);
// Releases the heap arrays with delete[]: `literals` (not declared in this class, so
// presumably inherited from RleEncoder) and the four scratch arrays declared below.
~RleEncoderV2() override {
delete[] literals;
delete[] gapVsPatchList_;
delete[] zigzagLiterals_;
delete[] baseRedLiterals_;
delete[] adjDeltas_;
}
/**
* Flushes the underlying BufferedOutputStream.
*/
uint64_t flush() override;
// Adds one value to the encoder.
void write(int64_t val) override;
private:
// NOTE(review): presumably the stored value of the constructor's alignBitPacking -- confirm.
const bool alignedBitPacking_;
uint32_t fixedRunLength_;
uint32_t variableRunLength_;
int64_t prevDelta_;
// Histogram with HIST_LEN entries; percentileBits() takes a reuseHist flag.
int32_t histgram_[HIST_LEN];
// The four lists below logically belong to EncodingOption, since they only hold temporary
// values used in write(int64_t val); they were moved here for performance reasons.
int64_t* gapVsPatchList_;
int64_t* zigzagLiterals_;
int64_t* baseRedLiterals_;
int64_t* adjDeltas_;
// Private helpers. The encoding helpers all operate on one EncodingOption passed by reference.
uint32_t getOpCode(EncodingType encoding);
int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
void determineEncoding(EncodingOption& option);
void computeZigZagLiterals(EncodingOption& option);
void preparePatchedBlob(EncodingOption& option);
void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize);
void initializeLiterals(int64_t val);
void writeValues(EncodingOption& option);
void writeShortRepeatValues(EncodingOption& option);
void writeDirectValues(EncodingOption& option);
void writePatchedBasedValues(EncodingOption& option);
void writeDeltaValues(EncodingOption& option);
// reuseHist defaults to false.
// NOTE(review): presumably returns the bit width covering fraction p of data[offset,
// offset + length), reusing histgram_ when reuseHist is true -- confirm in the implementation.
uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p,
bool reuseHist = false);
};
/**
 * RLE version 2 decoder.
 *
 * Derives from RleDecoder and overrides seek(), skip() and the three typed
 * next() overloads. Besides the decoding entry points, the class exposes
 * accessors for the current input window [bufferStart_, bufferEnd_) and for the
 * bit-level read state (bitsLeft_, curByte_).
 */
class RleDecoderV2 : public RleDecoder {
 public:
  RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned, MemoryPool& pool,
               ReaderMetrics* metrics);

  /**
   * Seek to a particular spot.
   */
  void seek(PositionProvider&) override;

  /**
   * Seek over a given number of values.
   */
  void skip(uint64_t numValues) override;

  /**
   * Read a number of values into the batch.
   */
  template <typename T>
  void next(T* data, uint64_t numValues, const char* notNull);

  void next(int64_t* data, uint64_t numValues, const char* notNull) override;

  void next(int32_t* data, uint64_t numValues, const char* notNull) override;

  void next(int16_t* data, uint64_t numValues, const char* notNull) override;

  unsigned char readByte();

  /**
   * Set the start of the current input window.
   * The pointer is stored as non-const char*, hence the const_cast.
   */
  void setBufStart(const char* start) {
    bufferStart_ = const_cast<char*>(start);
  }

  /** Start of the current input window. */
  char* getBufStart() const {
    return bufferStart_;
  }

  /**
   * Set the end of the current input window.
   * The pointer is stored as non-const char*, hence the const_cast.
   */
  void setBufEnd(const char* end) {
    bufferEnd_ = const_cast<char*>(end);
  }

  /** End of the current input window. */
  char* getBufEnd() const {
    return bufferEnd_;
  }

  /**
   * Number of bytes between bufferStart_ and bufferEnd_.
   * The pointer difference is signed; it is converted to uint64_t explicitly,
   * so the result is only meaningful while bufferStart_ <= bufferEnd_.
   */
  uint64_t bufLength() const {
    return static_cast<uint64_t>(bufferEnd_ - bufferStart_);
  }

  void setBitsLeft(const uint32_t bits) {
    bitsLeft_ = bits;
  }

  void setCurByte(const uint32_t byte) {
    curByte_ = byte;
  }

  uint32_t getBitsLeft() const {
    return bitsLeft_;
  }

  uint32_t getCurByte() const {
    return curByte_;
  }

  /**
   * Advance or refill the input window; defined inline after the class.
   * Declared inline for performance: most of this function's cost is said to be
   * the call overhead of saving the stack.
   */
  inline void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen);

 private:
  /**
   * Decode the next gap and patch from 'unpackedPatch' and update the index on it.
   * Used by PATCHED_BASE.
   *
   * @param patchBitSize  bit size of the patch value
   * @param patchMask     mask for the patch value
   * @param resGap        result of gap
   * @param resPatch      result of patch
   * @param patchIdx      current index in the 'unpackedPatch' buffer
   */
  void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap,
                         int64_t* resPatch, uint64_t* patchIdx);

  /** Clear the bit-level read state (bitsLeft_ and curByte_). */
  void resetReadLongs() {
    bitsLeft_ = 0;
    curByte_ = 0;
  }

  /** Reset per-run state; currently this only clears the bit-level read state. */
  void resetRun() {
    resetReadLongs();
  }

  int64_t readLongBE(uint64_t bsz);
  int64_t readVslong();
  uint64_t readVulong();
  void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs);

  template <typename T>
  uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
  template <typename T>
  uint64_t nextDirect(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
  template <typename T>
  uint64_t nextPatched(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
  template <typename T>
  uint64_t nextDelta(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
  template <typename T>
  uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull);

  const std::unique_ptr<SeekableInputStream> inputStream_;
  const bool isSigned_;
  unsigned char firstByte_;
  char* bufferStart_;
  char* bufferEnd_;
  uint64_t runLength_;                 // Length of the current run
  uint64_t runRead_;                   // Number of returned values of the current run
  uint32_t bitsLeft_;                  // Used by readLongs when bitSize < 8
  uint32_t curByte_;                   // Used by anything that uses readLongs
  DataBuffer<int64_t> unpackedPatch_;  // Used by PATCHED_BASE
  DataBuffer<int64_t> literals_;       // Values of the current run
};
inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) {
uint64_t remainingLen = bufLength();
int bufferLength = 0;
const void* bufferPointer = nullptr;
if (backupByteLen != 0) {
inputStream_->BackUp(backupByteLen);
}
if (len >= remainingLen && resetBuf) {
if (!inputStream_->Next(&bufferPointer, &bufferLength)) {
throw ParseError("bad read in RleDecoderV2::resetBufferStart");
}
}
if (bufferPointer == nullptr) {
bufferStart_ += len;
} else {
bufferStart_ = const_cast<char*>(static_cast<const char*>(bufferPointer));
bufferEnd_ = bufferStart_ + bufferLength;
}
}
} // namespace orc
#endif // ORC_RLEV2_HH