c++/src/RLEv2.hh - orc - Git at Google

 /**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef ORC_RLEV2_HH
 #define ORC_RLEV2_HH

 #include "Adaptor.hh"
 #include "orc/Exceptions.hh"
 #include "RLE.hh"

 #include <vector>

 #define MIN_REPEAT 3
 #define HIST_LEN 32
 namespace orc {

 struct FixedBitSizes {
     enum FBS {
         ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
         THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
         TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
         TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE
     };
 };

 enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };

 struct EncodingOption {
   EncodingType encoding;
   int64_t fixedDelta;
   int64_t gapVsPatchListCount;
   int64_t zigzagLiteralsCount;
   int64_t baseRedLiteralsCount;
   int64_t adjDeltasCount;
   uint32_t zzBits90p;
   uint32_t zzBits100p;
   uint32_t brBits95p;
   uint32_t brBits100p;
   uint32_t bitsDeltaMax;
   uint32_t patchWidth;
   uint32_t patchGapWidth;
   uint32_t patchLength;
   int64_t min;
   bool isFixedDelta;
 };

 class RleEncoderV2 : public RleEncoder {
 public:
     RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true);

     ~RleEncoderV2() override {
       delete [] literals;
       delete [] gapVsPatchList;
       delete [] zigzagLiterals;
       delete [] baseRedLiterals;
       delete [] adjDeltas;
     }
     /**
      * Flushing underlying BufferedOutputStream
      */
     uint64_t flush() override;

     void write(int64_t val) override;

 private:

     const bool alignedBitPacking;
     uint32_t fixedRunLength;
     uint32_t variableRunLength;
     int64_t prevDelta;
     int32_t histgram[HIST_LEN];

     // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val),
     // it is move here for performance consideration.
     int64_t* gapVsPatchList;
     int64_t*  zigzagLiterals;
     int64_t*  baseRedLiterals;
     int64_t*  adjDeltas;

     uint32_t getOpCode(EncodingType encoding);
     void determineEncoding(EncodingOption& option);
     void computeZigZagLiterals(EncodingOption& option);
     void preparePatchedBlob(EncodingOption& option);

     void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize);
     void initializeLiterals(int64_t val);
     void writeValues(EncodingOption& option);
     void writeShortRepeatValues(EncodingOption& option);
     void writeDirectValues(EncodingOption& option);
     void writePatchedBasedValues(EncodingOption& option);
     void writeDeltaValues(EncodingOption& option);
     uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false);
 };

 class RleDecoderV2 : public RleDecoder {
 public:
   RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
                bool isSigned, MemoryPool& pool);

   /**
   * Seek to a particular spot.
   */
   void seek(PositionProvider&) override;

   /**
   * Seek over a given number of values.
   */
   void skip(uint64_t numValues) override;

   /**
   * Read a number of values into the batch.
   */
   void next(int64_t* data, uint64_t numValues,
             const char* notNull) override;

 private:

   // Used by PATCHED_BASE
   void adjustGapAndPatch() {
     curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
       patchBitSize;
     curPatch = unpackedPatch[patchIdx] & patchMask;
     actualGap = 0;

     // special case: gap is >255 then patch value will be 0.
     // if gap is <=255 then patch value cannot be 0
     while (curGap == 255 && curPatch == 0) {
       actualGap += 255;
       ++patchIdx;
       curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
         patchBitSize;
       curPatch = unpackedPatch[patchIdx] & patchMask;
     }
     // add the left over gap
     actualGap += curGap;
   }

   void resetReadLongs() {
     bitsLeft = 0;
     curByte = 0;
   }

   void resetRun() {
     resetReadLongs();
     bitSize = 0;
   }

   unsigned char readByte() {
   if (bufferStart == bufferEnd) {
     int bufferLength;
     const void* bufferPointer;
     if (!inputStream->Next(&bufferPointer, &bufferLength)) {
       throw ParseError("bad read in RleDecoderV2::readByte");
     }
     bufferStart = static_cast<const char*>(bufferPointer);
     bufferEnd = bufferStart + bufferLength;
   }

   unsigned char result = static_cast<unsigned char>(*bufferStart++);
   return result;
 }

   int64_t readLongBE(uint64_t bsz);
   int64_t readVslong();
   uint64_t readVulong();
   uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len,
                      uint64_t fb, const char* notNull = nullptr) {
   uint64_t ret = 0;

   // TODO: unroll to improve performance
   for(uint64_t i = offset; i < (offset + len); i++) {
     // skip null positions
     if (notNull && !notNull[i]) {
       continue;
     }
     uint64_t result = 0;
     uint64_t bitsLeftToRead = fb;
     while (bitsLeftToRead > bitsLeft) {
       result <<= bitsLeft;
       result |= curByte & ((1 << bitsLeft) - 1);
       bitsLeftToRead -= bitsLeft;
       curByte = readByte();
       bitsLeft = 8;
     }

     // handle the left over bits
     if (bitsLeftToRead > 0) {
       result <<= bitsLeftToRead;
       bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
       result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
     }
     data[i] = static_cast<int64_t>(result);
     ++ret;
   }

   return ret;
 }

   uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
                             const char* notNull);
   uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues,
                       const char* notNull);
   uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues,
                        const char* notNull);
   uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues,
                      const char* notNull);

   const std::unique_ptr<SeekableInputStream> inputStream;
   const bool isSigned;

   unsigned char firstByte;
   uint64_t runLength;
   uint64_t runRead;
   const char *bufferStart;
   const char *bufferEnd;
   int64_t deltaBase; // Used by DELTA
   uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE
   int64_t firstValue; // Used by SHORT_REPEAT and DELTA
   int64_t prevValue; // Used by DELTA
   uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA
   uint32_t bitsLeft; // Used by anything that uses readLongs
   uint32_t curByte; // Used by anything that uses readLongs
   uint32_t patchBitSize; // Used by PATCHED_BASE
   uint64_t unpackedIdx; // Used by PATCHED_BASE
   uint64_t patchIdx; // Used by PATCHED_BASE
   int64_t base; // Used by PATCHED_BASE
   uint64_t curGap; // Used by PATCHED_BASE
   int64_t curPatch; // Used by PATCHED_BASE
   int64_t patchMask; // Used by PATCHED_BASE
   int64_t actualGap; // Used by PATCHED_BASE
   DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE
   DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
 };
 }  // namespace orc

 #endif  // ORC_RLEV2_HH
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef ORC_RLEV2_HH
	#define ORC_RLEV2_HH

	#include "Adaptor.hh"
	#include "orc/Exceptions.hh"
	#include "RLE.hh"

	#include <vector>

	#define MIN_REPEAT 3
	#define HIST_LEN 32
	namespace orc {

	struct FixedBitSizes {
	enum FBS {
	ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
	THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
	TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
	TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE
	};
	};

	enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };

	struct EncodingOption {
	EncodingType encoding;
	int64_t fixedDelta;
	int64_t gapVsPatchListCount;
	int64_t zigzagLiteralsCount;
	int64_t baseRedLiteralsCount;
	int64_t adjDeltasCount;
	uint32_t zzBits90p;
	uint32_t zzBits100p;
	uint32_t brBits95p;
	uint32_t brBits100p;
	uint32_t bitsDeltaMax;
	uint32_t patchWidth;
	uint32_t patchGapWidth;
	uint32_t patchLength;
	int64_t min;
	bool isFixedDelta;
	};

	class RleEncoderV2 : public RleEncoder {
	public:
	RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true);

	~RleEncoderV2() override {
	delete [] literals;
	delete [] gapVsPatchList;
	delete [] zigzagLiterals;
	delete [] baseRedLiterals;
	delete [] adjDeltas;
	}
	/**
	* Flushing underlying BufferedOutputStream
	*/
	uint64_t flush() override;

	void write(int64_t val) override;

	private:

	const bool alignedBitPacking;
	uint32_t fixedRunLength;
	uint32_t variableRunLength;
	int64_t prevDelta;
	int32_t histgram[HIST_LEN];

	// The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val),
	// it is move here for performance consideration.
	int64_t* gapVsPatchList;
	int64_t* zigzagLiterals;
	int64_t* baseRedLiterals;
	int64_t* adjDeltas;

	uint32_t getOpCode(EncodingType encoding);
	void determineEncoding(EncodingOption& option);
	void computeZigZagLiterals(EncodingOption& option);
	void preparePatchedBlob(EncodingOption& option);

	void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize);
	void initializeLiterals(int64_t val);
	void writeValues(EncodingOption& option);
	void writeShortRepeatValues(EncodingOption& option);
	void writeDirectValues(EncodingOption& option);
	void writePatchedBasedValues(EncodingOption& option);
	void writeDeltaValues(EncodingOption& option);
	uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false);
	};

	class RleDecoderV2 : public RleDecoder {
	public:
	RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
	bool isSigned, MemoryPool& pool);

	/**
	* Seek to a particular spot.
	*/
	void seek(PositionProvider&) override;

	/**
	* Seek over a given number of values.
	*/
	void skip(uint64_t numValues) override;

	/**
	* Read a number of values into the batch.
	*/
	void next(int64_t* data, uint64_t numValues,
	const char* notNull) override;

	private:

	// Used by PATCHED_BASE
	void adjustGapAndPatch() {
	curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
	patchBitSize;
	curPatch = unpackedPatch[patchIdx] & patchMask;
	actualGap = 0;

	// special case: gap is >255 then patch value will be 0.
	// if gap is <=255 then patch value cannot be 0
	while (curGap == 255 && curPatch == 0) {
	actualGap += 255;
	++patchIdx;
	curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
	patchBitSize;
	curPatch = unpackedPatch[patchIdx] & patchMask;
	}
	// add the left over gap
	actualGap += curGap;
	}

	void resetReadLongs() {
	bitsLeft = 0;
	curByte = 0;
	}

	void resetRun() {
	resetReadLongs();
	bitSize = 0;
	}

	unsigned char readByte() {
	if (bufferStart == bufferEnd) {
	int bufferLength;
	const void* bufferPointer;
	if (!inputStream->Next(&bufferPointer, &bufferLength)) {
	throw ParseError("bad read in RleDecoderV2::readByte");
	}
	bufferStart = static_cast<const char*>(bufferPointer);
	bufferEnd = bufferStart + bufferLength;
	}

	unsigned char result = static_cast<unsigned char>(*bufferStart++);
	return result;
	}

	int64_t readLongBE(uint64_t bsz);
	int64_t readVslong();
	uint64_t readVulong();
	uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len,
	uint64_t fb, const char* notNull = nullptr) {
	uint64_t ret = 0;

	// TODO: unroll to improve performance
	for(uint64_t i = offset; i < (offset + len); i++) {
	// skip null positions
	if (notNull && !notNull[i]) {
	continue;
	}
	uint64_t result = 0;
	uint64_t bitsLeftToRead = fb;
	while (bitsLeftToRead > bitsLeft) {
	result <<= bitsLeft;
	result \|= curByte & ((1 << bitsLeft) - 1);
	bitsLeftToRead -= bitsLeft;
	curByte = readByte();
	bitsLeft = 8;
	}

	// handle the left over bits
	if (bitsLeftToRead > 0) {
	result <<= bitsLeftToRead;
	bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
	result \|= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
	}
	data[i] = static_cast<int64_t>(result);
	++ret;
	}

	return ret;
	}

	uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
	const char* notNull);
	uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues,
	const char* notNull);
	uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues,
	const char* notNull);
	uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues,
	const char* notNull);

	const std::unique_ptr<SeekableInputStream> inputStream;
	const bool isSigned;

	unsigned char firstByte;
	uint64_t runLength;
	uint64_t runRead;
	const char *bufferStart;
	const char *bufferEnd;
	int64_t deltaBase; // Used by DELTA
	uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE
	int64_t firstValue; // Used by SHORT_REPEAT and DELTA
	int64_t prevValue; // Used by DELTA
	uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA
	uint32_t bitsLeft; // Used by anything that uses readLongs
	uint32_t curByte; // Used by anything that uses readLongs
	uint32_t patchBitSize; // Used by PATCHED_BASE
	uint64_t unpackedIdx; // Used by PATCHED_BASE
	uint64_t patchIdx; // Used by PATCHED_BASE
	int64_t base; // Used by PATCHED_BASE
	uint64_t curGap; // Used by PATCHED_BASE
	int64_t curPatch; // Used by PATCHED_BASE
	int64_t patchMask; // Used by PATCHED_BASE
	int64_t actualGap; // Used by PATCHED_BASE
	DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE
	DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
	};
	} // namespace orc

	#endif // ORC_RLEV2_HH