cpc/src/fm85Util.cpp - datasketches-cpp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 // author Kevin Lang, Oath Research

 #include "fm85Util.h"

 #include <stdexcept>
 #include <new>

 extern void* (*fm85alloc)(size_t);

 /******************************************/

 void * shallowCopy (void * oldObject, size_t numBytes) {
   if (oldObject == NULL || numBytes == 0) throw std::invalid_argument("shallowCopyObject: bad argument");
   void * newObject = fm85alloc (numBytes);
   if (newObject == NULL) throw std::bad_alloc();
   memcpy (newObject, oldObject, numBytes);
   return (newObject);
 }

 /******************************************/

 U8 byteLeadingZerosTable[256];

 U8 slow_count_leading_zeros_in_byte (U8 the_byte)
 {
   int shift;
   int count = 0;
   for (shift = 7; shift >= 0; shift--) {
     int is_zero = !((the_byte >> shift) & 1);
     if (is_zero)
       count++;
     else
       break;
   }
   return count;
 }

 void fillByteLeadingZerosTable(void)
 {
   int j;
   for (j = 0; j < 256; j++)
     byteLeadingZerosTable[j] = slow_count_leading_zeros_in_byte ((U8) j);
 }

 /******************************************/

 #define FCLZ_MASK_56 ((U64) 0x00ffffffffffffff)
 #define FCLZ_MASK_48 ((U64) 0x0000ffffffffffff)
 #define FCLZ_MASK_40 ((U64) 0x000000ffffffffff)
 #define FCLZ_MASK_32 ((U64) 0x00000000ffffffff)
 #define FCLZ_MASK_24 ((U64) 0x0000000000ffffff)
 #define FCLZ_MASK_16 ((U64) 0x000000000000ffff)
 #define FCLZ_MASK_08 ((U64) 0x00000000000000ff)

 Short countLeadingZerosInUnsignedLong (U64 theInput) {
   if (theInput > FCLZ_MASK_56)
     return ((Short) ( 0 + byteLeadingZerosTable[(theInput >> 56) & FCLZ_MASK_08]));
   if (theInput > FCLZ_MASK_48)
     return ((Short) ( 8 + byteLeadingZerosTable[(theInput >> 48) & FCLZ_MASK_08]));
   if (theInput > FCLZ_MASK_40)
     return ((Short) (16 + byteLeadingZerosTable[(theInput >> 40) & FCLZ_MASK_08]));
   if (theInput > FCLZ_MASK_32)
     return ((Short) (24 + byteLeadingZerosTable[(theInput >> 32) & FCLZ_MASK_08]));
   if (theInput > FCLZ_MASK_24)
     return ((Short) (32 + byteLeadingZerosTable[(theInput >> 24) & FCLZ_MASK_08]));
   if (theInput > FCLZ_MASK_16)
     return ((Short) (40 + byteLeadingZerosTable[(theInput >> 16) & FCLZ_MASK_08]));
   if (theInput > FCLZ_MASK_08)
     return ((Short) (48 + byteLeadingZerosTable[(theInput >>  8) & FCLZ_MASK_08]));
   if (1)
     return ((Short) (56 + byteLeadingZerosTable[(theInput >>  0) & FCLZ_MASK_08]));
 }

 /******************************************/

 U8 byteTrailingZerosTable[256];

 // U8 lookupByteTrailingZeros (int x) { return (byteTrailingZerosTable[x]); }

 U8 slow_count_trailing_zeros_in_byte (U8 the_byte)
 {
   int shift;
   int count = 0;
   for (shift = 0; shift <= 7; shift++) {
     int is_zero = !((the_byte >> shift) & 1);
     if (is_zero)
       count++;
     else
       break;
   }
   return count;
 }

 void fillByteTrailingZerosTable(void)
 {
   int j;
   for (j = 0; j < 256; j++)
     byteTrailingZerosTable[j] = slow_count_trailing_zeros_in_byte ((U8) j);
 }

 Short countTrailingZerosInUnsignedLong (U64 theInput) {
   U64 tmp = theInput;
   int byte;
   int j = 0;
   for (j = 0; j < 8; j++) {
     byte = (tmp & 0xffULL);
     if (byte != 0) return ((Short) ((j << 3) + byteTrailingZerosTable[byte]));
     tmp >>= 8;
   }
   return ((Short) (64));
 }

 /******************************************/

 double invPow2Tab[256];

 void fillInvPow2Tab (void)
 {
   int j;
   for (j = 0; j < 256; j++) {
     invPow2Tab[j] = pow (2.0, (-1.0 * ((double) j)));
   }
 }

 /******************************************/

 double kxpByteLookup[256];

 void fillKxpByteLookup (void) // must call fillInvPow2Tab() first
 {
   int byte, col;
   for (byte = 0; byte < 256; byte++) {
     double sum = 0.0;
     for (col = 0; col < 8; col++) {
       int bit = (byte >> col) & 1;
       if (bit == 0) { // note the inverted logic
 	sum += invPow2Tab[col+1]; // note the "+1"
       }
     }
     kxpByteLookup[byte] = sum;
   }
 }


 /******************************************/

 Long divideLongsRoundingUp (Long x, Long y) {
   if (x < 0 || y <= 0) throw std::invalid_argument("divideLongsRoundingUp: bad argument");
   Long quotient = x / y;
   if (quotient * y == x) return (quotient);
   else return (quotient + 1);
 }

 Long longFloorLog2OfLong (Long x) {
   if (x < 1L) throw std::invalid_argument("longFloorLog2OfLong: bad argument");
   Long p = 0;
   Long y = 1;
  log2Loop:
   if (y == x) return (p);
   if (y  > x) return (p-1);
   p  += 1;
   y <<= 1;
   goto log2Loop;
 }

 /***********************************/

 // returns an integer that is between
 // zero and ceiling(log_2(k))-1, inclusive

 Long golombChooseNumberOfBaseBits (Long k, Long count) {
   if (k < 1L) throw std::invalid_argument("golombChooseNumberOfBaseBits: k < 1");
   if (count < 1L) throw std::invalid_argument("golombChooseNumberOfBaseBits: count < 1");
   Long quotient = (k - count) / count; // integer division
   if (quotient == 0) return (0);
   else return (longFloorLog2OfLong(quotient));
 }


 /*******************************************************/
 // This place-holder code was inadequate because it caused
 // the cost of the post-merge getResult() operation to be O(C)
 // instead of O(K). It did have the advantage of being
 // very simple and trustworthy during initial testing.

 Long wegnerCountBitsSetInMatrix (U64 * array, Long length) {
   Long i = 0;
   U64 pattern = 0;
   Long count = 0;
   //  clock_t t0, t1;
   //  t0 = clock();
 // Wegner's Bit-Counting Algorithm, CACM 3 (1960), p. 322.
   for (i = 0; i < length; i++) {
     pattern = array[i];
     while (pattern != 0) {
       pattern &= (pattern - 1);
       count++;
     }
   }
   //  t1 = clock();
   //  printf ("\n(Wegner CountBitsTime %.1f)\n", ((double) (t1 - t0)) / 1000.0);
   //  fflush (stdout);

   return count;
 }

 /*******************************************************/
 // Note: this is an adaptation of the Java code that Lee sent me,
 // which is apparently a variation of Figure 5-2 in "Hacker's Delight"
 // by Henry S. Warren.

 static inline Long warrenBitCount(U64 i) {
   i = i - ((i >> 1) & 0x5555555555555555ULL);
   i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
   i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
   i = i + (i >> 8);
   i = i + (i >> 16);
   i = i + (i >> 32);
   return (Long)i & 0x7f;
 }

 Long warrenCountBitsSetInMatrix (U64 * array, Long length) {
   Long i = 0;
   Long count = 0;
   for (i = 0; i < length; i++) {
     count += warrenBitCount(array[i]);
   }
   return count;
 }

 /*******************************************************/
 // This code is Figure 5-9 in "Hacker's Delight" by Henry S. Warren.

 #define CSA(h,l,a,b,c) {U64 u = a^b; U64 v = c; h = (a&b) | (u&v); l = u^v;}

 Long countBitsSetInMatrix (U64 * A, Long length) {
   if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
   Long tot, i;
   U64 ones, twos, twosA, twosB, fours, foursA, foursB, eights;
   tot = 0;
   fours = twos = ones = 0;

   for (i = 0; i <= length - 8; i = i + 8) {
     CSA(twosA, ones, ones, A[i+0], A[i+1]);
     CSA(twosB, ones, ones, A[i+2], A[i+3]);
     CSA(foursA, twos, twos, twosA, twosB);

     CSA(twosA, ones, ones, A[i+4], A[i+5]);
     CSA(twosB, ones, ones, A[i+6], A[i+7]);
     CSA(foursB, twos, twos, twosA, twosB);

     CSA(eights, fours, fours, foursA, foursB);

     tot += warrenBitCount(eights);
   }
   tot = 8*tot + 4*warrenBitCount(fours) + 2*warrenBitCount(twos) + warrenBitCount(ones);

   // Because I still don't fully trust this fancy version
   // assert(tot == wegnerCountBitsSetInMatrix(A, length));

   return (tot);
 }

 /*********************************************/
 // Here are some timings made with quickTestMerge.c
 // for the "5 5" case:

 // Wegner CountBitsTime 29.3
 // Warren CountBitsTime  5.3
 // CSA    CountBitsTime  4.3
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	// author Kevin Lang, Oath Research

	#include "fm85Util.h"

	#include <stdexcept>
	#include <new>

	extern void* (*fm85alloc)(size_t);

	/******************************************/

	void * shallowCopy (void * oldObject, size_t numBytes) {
	if (oldObject == NULL \|\| numBytes == 0) throw std::invalid_argument("shallowCopyObject: bad argument");
	void * newObject = fm85alloc (numBytes);
	if (newObject == NULL) throw std::bad_alloc();
	memcpy (newObject, oldObject, numBytes);
	return (newObject);
	}

	/******************************************/

	U8 byteLeadingZerosTable[256];

	U8 slow_count_leading_zeros_in_byte (U8 the_byte)
	{
	int shift;
	int count = 0;
	for (shift = 7; shift >= 0; shift--) {
	int is_zero = !((the_byte >> shift) & 1);
	if (is_zero)
	count++;
	else
	break;
	}
	return count;
	}

	void fillByteLeadingZerosTable(void)
	{
	int j;
	for (j = 0; j < 256; j++)
	byteLeadingZerosTable[j] = slow_count_leading_zeros_in_byte ((U8) j);
	}

	/******************************************/

	#define FCLZ_MASK_56 ((U64) 0x00ffffffffffffff)
	#define FCLZ_MASK_48 ((U64) 0x0000ffffffffffff)
	#define FCLZ_MASK_40 ((U64) 0x000000ffffffffff)
	#define FCLZ_MASK_32 ((U64) 0x00000000ffffffff)
	#define FCLZ_MASK_24 ((U64) 0x0000000000ffffff)
	#define FCLZ_MASK_16 ((U64) 0x000000000000ffff)
	#define FCLZ_MASK_08 ((U64) 0x00000000000000ff)

	Short countLeadingZerosInUnsignedLong (U64 theInput) {
	if (theInput > FCLZ_MASK_56)
	return ((Short) ( 0 + byteLeadingZerosTable[(theInput >> 56) & FCLZ_MASK_08]));
	if (theInput > FCLZ_MASK_48)
	return ((Short) ( 8 + byteLeadingZerosTable[(theInput >> 48) & FCLZ_MASK_08]));
	if (theInput > FCLZ_MASK_40)
	return ((Short) (16 + byteLeadingZerosTable[(theInput >> 40) & FCLZ_MASK_08]));
	if (theInput > FCLZ_MASK_32)
	return ((Short) (24 + byteLeadingZerosTable[(theInput >> 32) & FCLZ_MASK_08]));
	if (theInput > FCLZ_MASK_24)
	return ((Short) (32 + byteLeadingZerosTable[(theInput >> 24) & FCLZ_MASK_08]));
	if (theInput > FCLZ_MASK_16)
	return ((Short) (40 + byteLeadingZerosTable[(theInput >> 16) & FCLZ_MASK_08]));
	if (theInput > FCLZ_MASK_08)
	return ((Short) (48 + byteLeadingZerosTable[(theInput >> 8) & FCLZ_MASK_08]));
	if (1)
	return ((Short) (56 + byteLeadingZerosTable[(theInput >> 0) & FCLZ_MASK_08]));
	}

	/******************************************/

	U8 byteTrailingZerosTable[256];

	// U8 lookupByteTrailingZeros (int x) { return (byteTrailingZerosTable[x]); }

	U8 slow_count_trailing_zeros_in_byte (U8 the_byte)
	{
	int shift;
	int count = 0;
	for (shift = 0; shift <= 7; shift++) {
	int is_zero = !((the_byte >> shift) & 1);
	if (is_zero)
	count++;
	else
	break;
	}
	return count;
	}

	void fillByteTrailingZerosTable(void)
	{
	int j;
	for (j = 0; j < 256; j++)
	byteTrailingZerosTable[j] = slow_count_trailing_zeros_in_byte ((U8) j);
	}

	Short countTrailingZerosInUnsignedLong (U64 theInput) {
	U64 tmp = theInput;
	int byte;
	int j = 0;
	for (j = 0; j < 8; j++) {
	byte = (tmp & 0xffULL);
	if (byte != 0) return ((Short) ((j << 3) + byteTrailingZerosTable[byte]));
	tmp >>= 8;
	}
	return ((Short) (64));
	}

	/******************************************/

	double invPow2Tab[256];

	void fillInvPow2Tab (void)
	{
	int j;
	for (j = 0; j < 256; j++) {
	invPow2Tab[j] = pow (2.0, (-1.0 * ((double) j)));
	}
	}

	/******************************************/

	double kxpByteLookup[256];

	void fillKxpByteLookup (void) // must call fillInvPow2Tab() first
	{
	int byte, col;
	for (byte = 0; byte < 256; byte++) {
	double sum = 0.0;
	for (col = 0; col < 8; col++) {
	int bit = (byte >> col) & 1;
	if (bit == 0) { // note the inverted logic
	sum += invPow2Tab[col+1]; // note the "+1"
	}
	}
	kxpByteLookup[byte] = sum;
	}
	}


	/******************************************/

	Long divideLongsRoundingUp (Long x, Long y) {
	if (x < 0 \|\| y <= 0) throw std::invalid_argument("divideLongsRoundingUp: bad argument");
	Long quotient = x / y;
	if (quotient * y == x) return (quotient);
	else return (quotient + 1);
	}

	Long longFloorLog2OfLong (Long x) {
	if (x < 1L) throw std::invalid_argument("longFloorLog2OfLong: bad argument");
	Long p = 0;
	Long y = 1;
	log2Loop:
	if (y == x) return (p);
	if (y > x) return (p-1);
	p += 1;
	y <<= 1;
	goto log2Loop;
	}

	/***********************************/

	// returns an integer that is between
	// zero and ceiling(log_2(k))-1, inclusive

	Long golombChooseNumberOfBaseBits (Long k, Long count) {
	if (k < 1L) throw std::invalid_argument("golombChooseNumberOfBaseBits: k < 1");
	if (count < 1L) throw std::invalid_argument("golombChooseNumberOfBaseBits: count < 1");
	Long quotient = (k - count) / count; // integer division
	if (quotient == 0) return (0);
	else return (longFloorLog2OfLong(quotient));
	}


	/*******************************************************/
	// This place-holder code was inadequate because it caused
	// the cost of the post-merge getResult() operation to be O(C)
	// instead of O(K). It did have the advantage of being
	// very simple and trustworthy during initial testing.

	Long wegnerCountBitsSetInMatrix (U64 * array, Long length) {
	Long i = 0;
	U64 pattern = 0;
	Long count = 0;
	// clock_t t0, t1;
	// t0 = clock();
	// Wegner's Bit-Counting Algorithm, CACM 3 (1960), p. 322.
	for (i = 0; i < length; i++) {
	pattern = array[i];
	while (pattern != 0) {
	pattern &= (pattern - 1);
	count++;
	}
	}
	// t1 = clock();
	// printf ("\n(Wegner CountBitsTime %.1f)\n", ((double) (t1 - t0)) / 1000.0);
	// fflush (stdout);

	return count;
	}

	/*******************************************************/
	// Note: this is an adaptation of the Java code that Lee sent me,
	// which is apparently a variation of Figure 5-2 in "Hacker's Delight"
	// by Henry S. Warren.

	static inline Long warrenBitCount(U64 i) {
	i = i - ((i >> 1) & 0x5555555555555555ULL);
	i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
	i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
	i = i + (i >> 8);
	i = i + (i >> 16);
	i = i + (i >> 32);
	return (Long)i & 0x7f;
	}

	Long warrenCountBitsSetInMatrix (U64 * array, Long length) {
	Long i = 0;
	Long count = 0;
	for (i = 0; i < length; i++) {
	count += warrenBitCount(array[i]);
	}
	return count;
	}

	/*******************************************************/
	// This code is Figure 5-9 in "Hacker's Delight" by Henry S. Warren.

	#define CSA(h,l,a,b,c) {U64 u = a^b; U64 v = c; h = (a&b) \| (u&v); l = u^v;}

	Long countBitsSetInMatrix (U64 * A, Long length) {
	if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
	Long tot, i;
	U64 ones, twos, twosA, twosB, fours, foursA, foursB, eights;
	tot = 0;
	fours = twos = ones = 0;

	for (i = 0; i <= length - 8; i = i + 8) {
	CSA(twosA, ones, ones, A[i+0], A[i+1]);
	CSA(twosB, ones, ones, A[i+2], A[i+3]);
	CSA(foursA, twos, twos, twosA, twosB);

	CSA(twosA, ones, ones, A[i+4], A[i+5]);
	CSA(twosB, ones, ones, A[i+6], A[i+7]);
	CSA(foursB, twos, twos, twosA, twosB);

	CSA(eights, fours, fours, foursA, foursB);

	tot += warrenBitCount(eights);
	}
	tot = 8tot + 4warrenBitCount(fours) + 2*warrenBitCount(twos) + warrenBitCount(ones);

	// Because I still don't fully trust this fancy version
	// assert(tot == wegnerCountBitsSetInMatrix(A, length));

	return (tot);
	}

	/*********************************************/
	// Here are some timings made with quickTestMerge.c
	// for the "5 5" case:

	// Wegner CountBitsTime 29.3
	// Warren CountBitsTime 5.3
	// CSA CountBitsTime 4.3