hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/io/compress/lz4/lz4hc_encoder.h - hadoop-common - Git at Google

 /*
    LZ4 HC Encoder - Part of LZ4 HC algorithm
    Copyright (C) 2011-2013, Yann Collet.
    BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:

        * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following disclaimer
    in the documentation and/or other materials provided with the
    distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    You can contact the author at :
    - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
    - LZ4 source repository : http://code.google.com/p/lz4/
 */

 /* lz4hc_encoder.h must be included into lz4hc.c
    The objective of this file is to create a single LZ4 compression function source
    which will be instanciated multiple times with minor variations
    depending on a set of #define.
 */

 //****************************
 // Check required defines
 //****************************

 #ifndef FUNCTION_NAME
 #  error "FUNTION_NAME is not defined"
 #endif


 //****************************
 // Local definitions
 //****************************
 #define COMBINED_NAME_RAW(n1,n2) n1 ## n2
 #define COMBINED_NAME(n1,n2) COMBINED_NAME_RAW(n1,n2)
 #define ENCODE_SEQUENCE_NAME COMBINED_NAME(FUNCTION_NAME,_encodeSequence)
 #ifdef LIMITED_OUTPUT
 #  define ENCODE_SEQUENCE(i,o,a,m,r,d) if (ENCODE_SEQUENCE_NAME(i,o,a,m,r,d)) return 0;
 #else
 #  define ENCODE_SEQUENCE(i,o,a,m,r,d) ENCODE_SEQUENCE_NAME(i,o,a,m,r)
 #endif

 //****************************
 // Function code
 //****************************

 forceinline int ENCODE_SEQUENCE_NAME (
                        const BYTE** ip,
                        BYTE** op,
                        const BYTE** anchor,
                        int matchLength,
                        const BYTE* ref
 #ifdef LIMITED_OUTPUT
                       ,BYTE* oend
 #endif
                        )
 {
     int length, len;
     BYTE* token;

     // Encode Literal length
     length = (int)(*ip - *anchor);
     token = (*op)++;
 #ifdef LIMITED_OUTPUT
     if ((*op + length + (2 + 1 + LASTLITERALS) + (length>>8)) > oend) return 1; 		// Check output limit
 #endif
     if (length>=(int)RUN_MASK) { *token=(RUN_MASK<<ML_BITS); len = length-RUN_MASK; for(; len > 254 ; len-=255) *(*op)++ = 255;  *(*op)++ = (BYTE)len; }
     else *token = (BYTE)(length<<ML_BITS);

     // Copy Literals
     LZ4_BLINDCOPY(*anchor, *op, length);

     // Encode Offset
     LZ4_WRITE_LITTLEENDIAN_16(*op,(U16)(*ip-ref));

     // Encode MatchLength
     length = (int)(matchLength-MINMATCH);
 #ifdef LIMITED_OUTPUT
     if (*op + (1 + LASTLITERALS) + (length>>8) > oend) return 1; 		// Check output limit
 #endif
     if (length>=(int)ML_MASK) { *token+=ML_MASK; length-=ML_MASK; for(; length > 509 ; length-=510) { *(*op)++ = 255; *(*op)++ = 255; } if (length > 254) { length-=255; *(*op)++ = 255; } *(*op)++ = (BYTE)length; }
     else *token += (BYTE)(length);

     // Prepare next loop
     *ip += matchLength;
     *anchor = *ip;

     return 0;
 }


 int COMBINED_NAME(FUNCTION_NAME,_continue) (
                  void* ctxvoid,
                  const char* source,
                  char* dest,
                  int inputSize
 #ifdef LIMITED_OUTPUT
                 ,int maxOutputSize
 #endif
                 )
 {
     LZ4HC_Data_Structure* ctx = (LZ4HC_Data_Structure*) ctxvoid;
     const BYTE* ip = (const BYTE*) source;
     const BYTE* anchor = ip;
     const BYTE* const iend = ip + inputSize;
     const BYTE* const mflimit = iend - MFLIMIT;
     const BYTE* const matchlimit = (iend - LASTLITERALS);

     BYTE* op = (BYTE*) dest;
 #ifdef LIMITED_OUTPUT
     BYTE* const oend = op + maxOutputSize;
 #endif

     int	ml, ml2, ml3, ml0;
     const BYTE* ref=NULL;
     const BYTE* start2=NULL;
     const BYTE* ref2=NULL;
     const BYTE* start3=NULL;
     const BYTE* ref3=NULL;
     const BYTE* start0;
     const BYTE* ref0;

     // Ensure blocks follow each other
     if (ip != ctx->end) return 0;
     ctx->end += inputSize;

     ip++;

     // Main Loop
     while (ip < mflimit)
     {
         ml = LZ4HC_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref));
         if (!ml) { ip++; continue; }

         // saved, in case we would skip too much
         start0 = ip;
         ref0 = ref;
         ml0 = ml;

 _Search2:
         if (ip+ml < mflimit)
             ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2);
         else ml2 = ml;

         if (ml2 == ml)  // No better match
         {
             ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);
             continue;
         }

         if (start0 < ip)
         {
             if (start2 < ip + ml0)   // empirical
             {
                 ip = start0;
                 ref = ref0;
                 ml = ml0;
             }
         }

         // Here, start0==ip
         if ((start2 - ip) < 3)   // First Match too small : removed
         {
             ml = ml2;
             ip = start2;
             ref =ref2;
             goto _Search2;
         }

 _Search3:
         // Currently we have :
         // ml2 > ml1, and
         // ip1+3 <= ip2 (usually < ip1+ml1)
         if ((start2 - ip) < OPTIMAL_ML)
         {
             int correction;
             int new_ml = ml;
             if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML;
             if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
             correction = new_ml - (int)(start2 - ip);
             if (correction > 0)
             {
                 start2 += correction;
                 ref2 += correction;
                 ml2 -= correction;
             }
         }
         // Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18)

         if (start2 + ml2 < mflimit)
             ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3);
         else ml3 = ml2;

         if (ml3 == ml2) // No better match : 2 sequences to encode
         {
             // ip & ref are known; Now for ml
             if (start2 < ip+ml)  ml = (int)(start2 - ip);
             // Now, encode 2 sequences
             ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);
             ip = start2;
             ENCODE_SEQUENCE(&ip, &op, &anchor, ml2, ref2, oend);
             continue;
         }

         if (start3 < ip+ml+3) // Not enough space for match 2 : remove it
         {
             if (start3 >= (ip+ml)) // can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1
             {
                 if (start2 < ip+ml)
                 {
                     int correction = (int)(ip+ml - start2);
                     start2 += correction;
                     ref2 += correction;
                     ml2 -= correction;
                     if (ml2 < MINMATCH)
                     {
                         start2 = start3;
                         ref2 = ref3;
                         ml2 = ml3;
                     }
                 }

                 ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);
                 ip  = start3;
                 ref = ref3;
                 ml  = ml3;

                 start0 = start2;
                 ref0 = ref2;
                 ml0 = ml2;
                 goto _Search2;
             }

             start2 = start3;
             ref2 = ref3;
             ml2 = ml3;
             goto _Search3;
         }

         // OK, now we have 3 ascending matches; let's write at least the first one
         // ip & ref are known; Now for ml
         if (start2 < ip+ml)
         {
             if ((start2 - ip) < (int)ML_MASK)
             {
                 int correction;
                 if (ml > OPTIMAL_ML) ml = OPTIMAL_ML;
                 if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH;
                 correction = ml - (int)(start2 - ip);
                 if (correction > 0)
                 {
                     start2 += correction;
                     ref2 += correction;
                     ml2 -= correction;
                 }
             }
             else
             {
                 ml = (int)(start2 - ip);
             }
         }
         ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);

         ip = start2;
         ref = ref2;
         ml = ml2;

         start2 = start3;
         ref2 = ref3;
         ml2 = ml3;

         goto _Search3;

     }

     // Encode Last Literals
     {
         int lastRun = (int)(iend - anchor);
 #ifdef LIMITED_OUTPUT
         if (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) return 0;  // Check output limit
 #endif
         if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun > 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
         else *op++ = (BYTE)(lastRun<<ML_BITS);
         memcpy(op, anchor, iend - anchor);
         op += iend-anchor;
     }

     // End
     return (int) (((char*)op)-dest);
 }


 int FUNCTION_NAME (const char* source,
                  char* dest,
                  int inputSize
 #ifdef LIMITED_OUTPUT
                 ,int maxOutputSize
 #endif
                 )
 {
     void* ctx = LZ4_createHC(source);
     int result;
     if (ctx==NULL) return 0;
 #ifdef LIMITED_OUTPUT
     result = COMBINED_NAME(FUNCTION_NAME,_continue) (ctx, source, dest, inputSize, maxOutputSize);
 #else
     result = COMBINED_NAME(FUNCTION_NAME,_continue) (ctx, source, dest, inputSize);
 #endif
     LZ4_freeHC(ctx);

     return result;
 }


 //****************************
 // Clean defines
 //****************************

 // Required defines
 #undef FUNCTION_NAME

 // Locally Generated
 #undef ENCODE_SEQUENCE
 #undef ENCODE_SEQUENCE_NAME

 // Optional defines
 #ifdef LIMITED_OUTPUT
 #undef LIMITED_OUTPUT
 #endif
	/*
	LZ4 HC Encoder - Part of LZ4 HC algorithm
	Copyright (C) 2011-2013, Yann Collet.
	BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

	Redistribution and use in source and binary forms, with or without
	modification, are permitted provided that the following conditions are
	met:

	* Redistributions of source code must retain the above copyright
	notice, this list of conditions and the following disclaimer.
	* Redistributions in binary form must reproduce the above
	copyright notice, this list of conditions and the following disclaimer
	in the documentation and/or other materials provided with the
	distribution.

	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	You can contact the author at :
	- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
	- LZ4 source repository : http://code.google.com/p/lz4/
	*/

	/* lz4hc_encoder.h must be included into lz4hc.c
	The objective of this file is to create a single LZ4 compression function source
	which will be instanciated multiple times with minor variations
	depending on a set of #define.
	*/

	//****************************
	// Check required defines
	//****************************

	#ifndef FUNCTION_NAME
	# error "FUNTION_NAME is not defined"
	#endif


	//****************************
	// Local definitions
	//****************************
	#define COMBINED_NAME_RAW(n1,n2) n1 ## n2
	#define COMBINED_NAME(n1,n2) COMBINED_NAME_RAW(n1,n2)
	#define ENCODE_SEQUENCE_NAME COMBINED_NAME(FUNCTION_NAME,_encodeSequence)
	#ifdef LIMITED_OUTPUT
	# define ENCODE_SEQUENCE(i,o,a,m,r,d) if (ENCODE_SEQUENCE_NAME(i,o,a,m,r,d)) return 0;
	#else
	# define ENCODE_SEQUENCE(i,o,a,m,r,d) ENCODE_SEQUENCE_NAME(i,o,a,m,r)
	#endif

	//****************************
	// Function code
	//****************************

	forceinline int ENCODE_SEQUENCE_NAME (
	const BYTE** ip,
	BYTE** op,
	const BYTE** anchor,
	int matchLength,
	const BYTE* ref
	#ifdef LIMITED_OUTPUT
	,BYTE* oend
	#endif
	)
	{
	int length, len;
	BYTE* token;

	// Encode Literal length
	length = (int)(ip - anchor);
	token = (*op)++;
	#ifdef LIMITED_OUTPUT
	if ((*op + length + (2 + 1 + LASTLITERALS) + (length>>8)) > oend) return 1; // Check output limit
	#endif
	if (length>=(int)RUN_MASK) { token=(RUN_MASK<<ML_BITS); len = length-RUN_MASK; for(; len > 254 ; len-=255) (op)++ = 255; (*op)++ = (BYTE)len; }
	else *token = (BYTE)(length<<ML_BITS);

	// Copy Literals
	LZ4_BLINDCOPY(anchor, op, length);

	// Encode Offset
	LZ4_WRITE_LITTLEENDIAN_16(op,(U16)(ip-ref));

	// Encode MatchLength
	length = (int)(matchLength-MINMATCH);
	#ifdef LIMITED_OUTPUT
	if (*op + (1 + LASTLITERALS) + (length>>8) > oend) return 1; // Check output limit
	#endif
	if (length>=(int)ML_MASK) { token+=ML_MASK; length-=ML_MASK; for(; length > 509 ; length-=510) { (op)++ = 255; (op)++ = 255; } if (length > 254) { length-=255; (op)++ = 255; } (*op)++ = (BYTE)length; }
	else *token += (BYTE)(length);

	// Prepare next loop
	*ip += matchLength;
	anchor = ip;

	return 0;
	}


	int COMBINED_NAME(FUNCTION_NAME,_continue) (
	void* ctxvoid,
	const char* source,
	char* dest,
	int inputSize
	#ifdef LIMITED_OUTPUT
	,int maxOutputSize
	#endif
	)
	{
	LZ4HC_Data_Structure* ctx = (LZ4HC_Data_Structure*) ctxvoid;
	const BYTE* ip = (const BYTE*) source;
	const BYTE* anchor = ip;
	const BYTE* const iend = ip + inputSize;
	const BYTE* const mflimit = iend - MFLIMIT;
	const BYTE* const matchlimit = (iend - LASTLITERALS);

	BYTE* op = (BYTE*) dest;
	#ifdef LIMITED_OUTPUT
	BYTE* const oend = op + maxOutputSize;
	#endif

	int ml, ml2, ml3, ml0;
	const BYTE* ref=NULL;
	const BYTE* start2=NULL;
	const BYTE* ref2=NULL;
	const BYTE* start3=NULL;
	const BYTE* ref3=NULL;
	const BYTE* start0;
	const BYTE* ref0;

	// Ensure blocks follow each other
	if (ip != ctx->end) return 0;
	ctx->end += inputSize;

	ip++;

	// Main Loop
	while (ip < mflimit)
	{
	ml = LZ4HC_InsertAndFindBestMatch (ctx, ip, matchlimit, (&ref));
	if (!ml) { ip++; continue; }

	// saved, in case we would skip too much
	start0 = ip;
	ref0 = ref;
	ml0 = ml;

	_Search2:
	if (ip+ml < mflimit)
	ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, ip + ml - 2, ip + 1, matchlimit, ml, &ref2, &start2);
	else ml2 = ml;

	if (ml2 == ml) // No better match
	{
	ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);
	continue;
	}

	if (start0 < ip)
	{
	if (start2 < ip + ml0) // empirical
	{
	ip = start0;
	ref = ref0;
	ml = ml0;
	}
	}

	// Here, start0==ip
	if ((start2 - ip) < 3) // First Match too small : removed
	{
	ml = ml2;
	ip = start2;
	ref =ref2;
	goto _Search2;
	}

	_Search3:
	// Currently we have :
	// ml2 > ml1, and
	// ip1+3 <= ip2 (usually < ip1+ml1)
	if ((start2 - ip) < OPTIMAL_ML)
	{
	int correction;
	int new_ml = ml;
	if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML;
	if (ip+new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
	correction = new_ml - (int)(start2 - ip);
	if (correction > 0)
	{
	start2 += correction;
	ref2 += correction;
	ml2 -= correction;
	}
	}
	// Now, we have start2 = ip+new_ml, with new_ml = min(ml, OPTIMAL_ML=18)

	if (start2 + ml2 < mflimit)
	ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, start2 + ml2 - 3, start2, matchlimit, ml2, &ref3, &start3);
	else ml3 = ml2;

	if (ml3 == ml2) // No better match : 2 sequences to encode
	{
	// ip & ref are known; Now for ml
	if (start2 < ip+ml) ml = (int)(start2 - ip);
	// Now, encode 2 sequences
	ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);
	ip = start2;
	ENCODE_SEQUENCE(&ip, &op, &anchor, ml2, ref2, oend);
	continue;
	}

	if (start3 < ip+ml+3) // Not enough space for match 2 : remove it
	{
	if (start3 >= (ip+ml)) // can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1
	{
	if (start2 < ip+ml)
	{
	int correction = (int)(ip+ml - start2);
	start2 += correction;
	ref2 += correction;
	ml2 -= correction;
	if (ml2 < MINMATCH)
	{
	start2 = start3;
	ref2 = ref3;
	ml2 = ml3;
	}
	}

	ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);
	ip = start3;
	ref = ref3;
	ml = ml3;

	start0 = start2;
	ref0 = ref2;
	ml0 = ml2;
	goto _Search2;
	}

	start2 = start3;
	ref2 = ref3;
	ml2 = ml3;
	goto _Search3;
	}

	// OK, now we have 3 ascending matches; let's write at least the first one
	// ip & ref are known; Now for ml
	if (start2 < ip+ml)
	{
	if ((start2 - ip) < (int)ML_MASK)
	{
	int correction;
	if (ml > OPTIMAL_ML) ml = OPTIMAL_ML;
	if (ip + ml > start2 + ml2 - MINMATCH) ml = (int)(start2 - ip) + ml2 - MINMATCH;
	correction = ml - (int)(start2 - ip);
	if (correction > 0)
	{
	start2 += correction;
	ref2 += correction;
	ml2 -= correction;
	}
	}
	else
	{
	ml = (int)(start2 - ip);
	}
	}
	ENCODE_SEQUENCE(&ip, &op, &anchor, ml, ref, oend);

	ip = start2;
	ref = ref2;
	ml = ml2;

	start2 = start3;
	ref2 = ref3;
	ml2 = ml3;

	goto _Search3;

	}

	// Encode Last Literals
	{
	int lastRun = (int)(iend - anchor);
	#ifdef LIMITED_OUTPUT
	if (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) return 0; // Check output limit
	#endif
	if (lastRun>=(int)RUN_MASK) { op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun > 254 ; lastRun-=255) op++ = 255; *op++ = (BYTE) lastRun; }
	else *op++ = (BYTE)(lastRun<<ML_BITS);
	memcpy(op, anchor, iend - anchor);
	op += iend-anchor;
	}

	// End
	return (int) (((char*)op)-dest);
	}


	int FUNCTION_NAME (const char* source,
	char* dest,
	int inputSize
	#ifdef LIMITED_OUTPUT
	,int maxOutputSize
	#endif
	)
	{
	void* ctx = LZ4_createHC(source);
	int result;
	if (ctx==NULL) return 0;
	#ifdef LIMITED_OUTPUT
	result = COMBINED_NAME(FUNCTION_NAME,_continue) (ctx, source, dest, inputSize, maxOutputSize);
	#else
	result = COMBINED_NAME(FUNCTION_NAME,_continue) (ctx, source, dest, inputSize);
	#endif
	LZ4_freeHC(ctx);

	return result;
	}


	//****************************
	// Clean defines
	//****************************

	// Required defines
	#undef FUNCTION_NAME

	// Locally Generated
	#undef ENCODE_SEQUENCE
	#undef ENCODE_SEQUENCE_NAME

	// Optional defines
	#ifdef LIMITED_OUTPUT
	#undef LIMITED_OUTPUT
	#endif