blob: f48ce8391b368fd8a17694f82f477b6e1d575488 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ORC_RLEV2_HH
#define ORC_RLEV2_HH
#include "Adaptor.hh"
#include "RLE.hh"
#include "orc/Exceptions.hh"
#include <vector>
#define MAX_LITERAL_SIZE 512
#define MIN_REPEAT 3
#define HIST_LEN 32
namespace orc {
struct FixedBitSizes {
enum FBS {
ONE = 0,
TWO,
THREE,
FOUR,
FIVE,
SIX,
SEVEN,
EIGHT,
NINE,
TEN,
ELEVEN,
TWELVE,
THIRTEEN,
FOURTEEN,
FIFTEEN,
SIXTEEN,
SEVENTEEN,
EIGHTEEN,
NINETEEN,
TWENTY,
TWENTYONE,
TWENTYTWO,
TWENTYTHREE,
TWENTYFOUR,
TWENTYSIX,
TWENTYEIGHT,
THIRTY,
THIRTYTWO,
FORTY,
FORTYEIGHT,
FIFTYSIX,
SIXTYFOUR,
SIZE
};
};
enum EncodingType { SHORT_REPEAT = 0, DIRECT = 1, PATCHED_BASE = 2, DELTA = 3 };
struct EncodingOption {
EncodingType encoding;
int64_t fixedDelta;
int64_t gapVsPatchListCount;
int64_t zigzagLiteralsCount;
int64_t baseRedLiteralsCount;
int64_t adjDeltasCount;
uint32_t zzBits90p;
uint32_t zzBits100p;
uint32_t brBits95p;
uint32_t brBits100p;
uint32_t bitsDeltaMax;
uint32_t patchWidth;
uint32_t patchGapWidth;
uint32_t patchLength;
int64_t min;
bool isFixedDelta;
};
class RleEncoderV2 : public RleEncoder {
public:
RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned,
bool alignBitPacking = true);
~RleEncoderV2() override {
delete[] literals;
delete[] gapVsPatchList;
delete[] zigzagLiterals;
delete[] baseRedLiterals;
delete[] adjDeltas;
}
/**
* Flushing underlying BufferedOutputStream
*/
uint64_t flush() override;
void write(int64_t val) override;
private:
const bool alignedBitPacking;
uint32_t fixedRunLength;
uint32_t variableRunLength;
int64_t prevDelta;
int32_t histgram[HIST_LEN];
// The four list below should actually belong to EncodingOption since it only holds temporal
// values in write(int64_t val), it is move here for performance consideration.
int64_t* gapVsPatchList;
int64_t* zigzagLiterals;
int64_t* baseRedLiterals;
int64_t* adjDeltas;
uint32_t getOpCode(EncodingType encoding);
int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
void determineEncoding(EncodingOption& option);
void computeZigZagLiterals(EncodingOption& option);
void preparePatchedBlob(EncodingOption& option);
void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize);
void initializeLiterals(int64_t val);
void writeValues(EncodingOption& option);
void writeShortRepeatValues(EncodingOption& option);
void writeDirectValues(EncodingOption& option);
void writePatchedBasedValues(EncodingOption& option);
void writeDeltaValues(EncodingOption& option);
uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p,
bool reuseHist = false);
};
class RleDecoderV2 : public RleDecoder {
public:
RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned, MemoryPool& pool,
ReaderMetrics* metrics);
/**
* Seek to a particular spot.
*/
void seek(PositionProvider&) override;
/**
* Seek over a given number of values.
*/
void skip(uint64_t numValues) override;
/**
* Read a number of values into the batch.
*/
template <typename T>
void next(T* data, uint64_t numValues, const char* notNull);
void next(int64_t* data, uint64_t numValues, const char* notNull) override;
void next(int32_t* data, uint64_t numValues, const char* notNull) override;
void next(int16_t* data, uint64_t numValues, const char* notNull) override;
private:
/**
* Decode the next gap and patch from 'unpackedPatch' and update the index on it.
* Used by PATCHED_BASE.
*
* @param patchBitSize bit size of the patch value
* @param patchMask mask for the patch value
* @param resGap result of gap
* @param resPatch result of patch
* @param patchIdx current index in the 'unpackedPatch' buffer
*/
void adjustGapAndPatch(uint32_t patchBitSize, int64_t patchMask, int64_t* resGap,
int64_t* resPatch, uint64_t* patchIdx);
void resetReadLongs() {
bitsLeft = 0;
curByte = 0;
}
void resetRun() {
resetReadLongs();
}
unsigned char readByte();
int64_t readLongBE(uint64_t bsz);
int64_t readVslong();
uint64_t readVulong();
void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs);
void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs);
void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len);
void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len);
template <typename T>
uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
template <typename T>
uint64_t nextDirect(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
template <typename T>
uint64_t nextPatched(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
template <typename T>
uint64_t nextDelta(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
template <typename T>
uint64_t copyDataFromBuffer(T* data, uint64_t offset, uint64_t numValues, const char* notNull);
const std::unique_ptr<SeekableInputStream> inputStream;
const bool isSigned;
unsigned char firstByte;
uint64_t runLength; // Length of the current run
uint64_t runRead; // Number of returned values of the current run
const char* bufferStart;
const char* bufferEnd;
uint32_t bitsLeft; // Used by readLongs when bitSize < 8
uint32_t curByte; // Used by anything that uses readLongs
DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
DataBuffer<int64_t> literals; // Values of the current run
};
} // namespace orc
#endif // ORC_RLEV2_HH