| /*! |
| * \file sketch_support.c |
| * |
| * \brief Support routines for managing bitmaps used in sketches |
| */ |
| |
| |
| /*! @addtogroup desc-stats |
| * |
| * @par About: |
| * MADlib provides a number of descriptive statistics to complement |
| * SQL's builtin aggregation functions (COUNT, SUM, MAX, MIN, AVG, STDDEV). |
| * When possible we try |
| * to provide high-peformance algorithms that run in a single (parallel) |
| * pass of the data without overflowing main memory. In some cases this |
| * is achieved by approximation |
| * algorithms (e.g. sketches) -- for those algorithms it's important to |
| * understand that answers are guaranteed mathematically to be within |
| * plus-or-minus a small epsilon of the right answer with high probability. |
| * It's always good to go back the research papers cited in the documents to |
| * understand the caveats involved. |
| * |
| * \par |
| * In this module you will find methods for: |
| * - order statistics (quantiles, median) |
| * - distinct counting |
| * - histogramming |
| * - frequent-value counting |
| */ |
| |
| /* |
| @addtogroup sketches |
| @todo |
| - Provide a relatively portable SQL-only implementation of CountMin. (FM bit manipulation won't port well regardless). |
| - Provide a python wrapper to the CountMin sketch output, and write scalar functions in python. |
| |
| |
| @bug |
| - <i>Equality-Testing Corner Case:</i> |
| For hashing, we convert values into text using the type's text output routine. |
| In general this should work fine, but there is the possibility that two |
| different values in the domain could have the same textual representation. |
| In these corner cases we will see incorrect counts for those values. |
| The proper way to do this is not to use the "outfunc", but rather to look up the |
| type-specific hash function as is done internally for hashjoin, hash indexes, |
| etc. The basic pattern for looking up the hash function in Postgres |
| internals is something like the following: |
| @code |
| get_sort_group_operators(dtype, false, true, false, <Opr, &eqOpr, >Opr); |
| success = get_op_hash_functions(eqOpr, result, NULL)); |
| @endcode |
| */ |
| /* THIS CODE MAY NEED TO BE REVISITED TO ENSURE ALIGNMENT! */ |
| |
| #include "postgres.h" |
| #include "utils/array.h" |
| #include "utils/elog.h" |
| #include "nodes/execnodes.h" |
| #include "fmgr.h" |
| #include "sketch_support.h" |
| #include "utils/builtins.h" |
| |
| /*! |
| * Simple linear function to find the rightmost bit that's set to one |
| * (i.e. the # of trailing zeros to the right). |
| * \param bits a bitmap containing many fm sketches |
| * \param numsketches the number of sketches in the bits variable |
| * \param the size of each sketch in bits |
| * \param the sketch number in which we want to find the rightmost one |
| */ |
| uint32 rightmost_one(uint8 *bits, |
| size_t numsketches, |
| size_t sketchsz_bits, |
| size_t sketchnum) |
| { |
| uint8 *s = |
| &(((uint8 *)(bits))[sketchnum*sketchsz_bits/8]); |
| int i; |
| uint32 c = 0; /* output: c will count trailing zero bits, */ |
| |
| if (sketchsz_bits % (sizeof(uint32)*CHAR_BIT)) |
| elog( |
| ERROR, |
| "number of bits per sketch is %u, must be a multiple of sizeof(uint32) = %u", |
| (uint32)sketchsz_bits, |
| (uint32)sizeof(uint32)); |
| |
| /* |
| * loop through the chunks of bits from right to left, counting zeros. |
| * stop when we hit a 1. |
| * XXX currently we look at CHAR_BIT (8) bits at a time |
| * which would seem to avoid any 32- vs. 64-bit concerns. |
| * might be worth tuning this up to do 32 bits at a time. |
| */ |
| for (i = sketchsz_bits/(CHAR_BIT) -1; i >= 0; i--) |
| { |
| uint32 v = (uint32) (s[i]); |
| |
| if (!v) /* all CHAR_BIT of these bits are 0 */ |
| c += CHAR_BIT /* * sizeof(uint32) */; |
| else |
| { |
| c += ui_rightmost_one(v); |
| break; /* we found a 1 in this value of i, so we stop looping here. */ |
| } |
| } |
| return c; |
| } |
| |
| /*! |
| * Simple linear function to find the leftmost zero (# leading 1's) |
| * Would be nice to unify with the previous -- e.g. a foomost_bar function |
| * where foo would be left or right, and bar would be 0 or 1. |
| * \param bits a bitmap containing many fm sketches |
| * \param numsketches the number of sketches in the bits variable |
| * \param the size of each sketch in bits |
| * \param the sketch number in which we want to find the rightmost one |
| */ |
| uint32 leftmost_zero(uint8 *bits, |
| size_t numsketches, |
| size_t sketchsz_bits, |
| size_t sketchnum) |
| { |
| uint8 * s = &(((uint8 *)bits)[sketchnum*sketchsz_bits/8]); |
| |
| unsigned i; |
| uint32 c = 0; /* output: c will count trailing zero bits, */ |
| uint32 maxbyte = pow(2,8) - 1; |
| |
| if (sketchsz_bits % (sizeof(uint32)*8)) |
| elog( |
| ERROR, |
| "number of bits per sketch is %u, must be a multiple of sizeof(uint32) = %u", |
| (uint32)sketchsz_bits, |
| (uint32)sizeof(uint32)); |
| |
| if (sketchsz_bits > numsketches*8) |
| elog(ERROR, "sketch sz declared at %u, but bitmap is only %u", |
| (uint32)sketchsz_bits, (uint32)numsketches*8); |
| |
| |
| /* |
| * loop through the chunks of bits from left to right, counting ones. |
| * stop when we hit a 0. |
| */ |
| for (i = 0; i < sketchsz_bits/(CHAR_BIT); i++) |
| { |
| uint32 v = (uint32) s[i]; |
| |
| if (v == maxbyte) |
| c += CHAR_BIT /* * sizeof(uint32) */; |
| else |
| { |
| /* |
| * reverse and invert v, then do rightmost_one |
| * magic reverse from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith32Bits |
| */ |
| uint8 b = |
| ((s[i] * 0x0802LU & |
| 0x22110LU) | |
| (s[i] * 0x8020LU & |
| 0x88440LU)) * 0x10101LU >> 16; |
| v = (uint32) b ^ 0xff; |
| |
| c += ui_rightmost_one(v); |
| break; /* we found a 1 in this value of i, so we stop looping here. */ |
| } |
| } |
| return c; |
| } |
| |
| |
| /* |
| * Given an array of n b-bit bitmaps, turn on the k'th most |
| * significant bit of the j'th bitmap. |
| * Both j and k are zero-indexed, BUT the bitmaps are indexed left-to-right, |
| * whereas significant bits are (of course!) right-to-left within the bitmap. |
| * |
| * This function makes destructive updates; the caller should make sure to check |
| * that we're being called in an agg context! |
| * \param bitmap an array of FM sketches |
| * \param numsketches # of sketches in the array |
| * \param sketchsz_bits # of BITS per sketch |
| * \param sketchnum index of the sketch to modify (from left, zero-indexed) |
| * \param bitnum bit offset (from right, zero-indexed) in that sketch |
| */ |
| Datum array_set_bit_in_place(bytea *bitmap, |
| int4 numsketches, |
| int4 sketchsz_bits, |
| int4 sketchnum, |
| int4 bitnum) |
| { |
| char mask; |
| char bytes_per_sketch = sketchsz_bits/CHAR_BIT; |
| |
| if (sketchnum >= numsketches || sketchnum < 0) |
| elog(ERROR, |
| "sketch offset exceeds the number of sketches (0-based)"); |
| if (bitnum >= sketchsz_bits || bitnum < 0) |
| elog( |
| ERROR, |
| "bit offset exceeds the number of bits per sketch (0-based)"); |
| if (sketchsz_bits % sizeof(uint32)) |
| elog( |
| ERROR, |
| "number of bits per sketch is %d, must be a multiple of sizeof(uint32) = %u", |
| sketchsz_bits, |
| (uint32)sizeof(uint32)); |
| |
| mask = 0x1 << bitnum%8; /* the bit to be modified within the proper byte (from the right) */ |
| ((char *)(VARDATA(bitmap)))[sketchnum*bytes_per_sketch /* left boundary of the proper sketch */ |
| + ( (bytes_per_sketch - 1) /* right boundary of the proper sketch */ |
| - bitnum/CHAR_BIT /* the byte to be modified (from the right) */ |
| ) |
| ] |
| |= mask; |
| |
| PG_RETURN_BYTEA_P(bitmap); |
| } |
| |
| /*! |
| * Simple linear function to find the rightmost one (# trailing zeros) in an uint32. |
| * Based on |
| * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightLinear |
| * Look there for fancier ways to do this. |
| * \param v an integer |
| */ |
| uint32 ui_rightmost_one(uint32 v) |
| { |
| uint32 c; |
| |
| v = (v ^ (v - 1)) >> 1; /* Set v's trailing 0s to 1s and zero rest */ |
| |
| for (c = 0; v; c++) |
| { |
| v >>= 1; |
| } |
| |
| return c; |
| } |
| |
| /*! |
| * the postgres internal md5 routine only provides public access to text output |
| * here we convert that text (in hex notation) back into bytes. |
| * postgres hex output has two hex characters for each 8-bit byte. |
| * so the output of this will be exactly half as many bytes as the input. |
| * \param hex a string encoding bytes in hex |
| * \param bytes out-value that will hold binary version of hex |
| * \param hexlen the length of the hex string |
| */ |
| void hex_to_bytes(char *hex, uint8 *bytes, size_t hexlen) |
| { |
| uint32 i; |
| |
| for (i = 0; i < hexlen; i+=2) /* +2 to consume 2 hex characters each time */ |
| { |
| char c1 = hex[i]; /* high-order bits */ |
| char c2 = hex[i+1]; /* low-order bits */ |
| int b1 = 0, b2 = 0; |
| |
| if (isdigit(c1)) b1 = c1 - '0'; |
| else if (c1 >= 'A' && c1 <= 'F') b1 = c1 - 'A' + 10; |
| else if (c1 >= 'a' && c1 <= 'f') b1 = c1 - 'a' + 10; |
| |
| if (isdigit(c2)) b2 = c2 - '0'; |
| else if (c2 >= 'A' && c2 <= 'F') b2 = c2 - 'A' + 10; |
| else if (c2 >= 'a' && c2 <= 'f') b2 = c2 - 'a' + 10; |
| |
| bytes[i/2] = b1*16 + b2; /* i/2 because our for loop is incrementing by 2 */ |
| } |
| |
| } |
| |
| |
| /*! debugging utility to output strings in binary */ |
| void |
| bit_print(uint8 *c, int numbytes) |
| { |
| int j, i, l; |
| char p[MD5_HASHLEN_BITS+2]; |
| uint32 n; |
| |
| for (j = 0, l=0; j < numbytes; j++) { |
| n = (uint32)c[j]; |
| i = 1<<(CHAR_BIT - 1); |
| while (i > 0) { |
| if (n & i) |
| p[l++] = '1'; |
| else |
| p[l++] = '0'; |
| i >>= 1; |
| } |
| p[l] = '\0'; |
| } |
| elog(NOTICE, "bitmap: %s", p); |
| } |
| |
| /*! |
| * The POSTGRES code for md5 returns a bytea with a textual representation of the |
| * md5 result. We then convert it back into binary. |
| * XXX The internal POSTGRES source code is actually converting from binary to the bytea |
| * so this is rather wasteful, but the internal code is marked static and unavailable here. |
| * XXX In fact, the full cost right now is: |
| * XXX -- outfunc converts internal type to CString |
| * XXX -- here we convert CString to text, to call md5_bytea |
| * XXX -- md5_bytea generates a byte array |
| * XXX -- we then call binary_decode to convert back to a textual representation |
| * XXX -- (alternatively we could call hex_to_bytes in sketch_support.c for the last step) |
| */ |
| Datum md5_datum(char *input) |
| { |
| Datum hashtxt = |
| DirectFunctionCall1(md5_bytea, PointerGetDatum(cstring_to_text(input))); |
| |
| return DirectFunctionCall2(binary_decode, hashtxt, |
| CStringGetTextDatum("hex")); |
| } |
| |
| |
| /* TEST ROUTINES */ |
| PG_FUNCTION_INFO_V1(sketch_array_set_bit_in_place); |
| Datum sketch_array_set_bit_in_place(PG_FUNCTION_ARGS); |
| PG_FUNCTION_INFO_V1(sketch_rightmost_one); |
| Datum sketch_rightmost_one(PG_FUNCTION_ARGS); |
| PG_FUNCTION_INFO_V1(sketch_leftmost_zero); |
| Datum sketch_leftmost_zero(PG_FUNCTION_ARGS); |
| |
| Datum sketch_rightmost_one(PG_FUNCTION_ARGS) |
| { |
| bytea *bitmap = (bytea *)PG_GETARG_BYTEA_P(0); |
| size_t sketchsz = PG_GETARG_INT32(1); /* size in bits */ |
| size_t sketchnum = PG_GETARG_INT32(2); /* from the left! */ |
| char * bits = VARDATA(bitmap); |
| size_t len = VARSIZE_ANY_EXHDR(bitmap); |
| |
| return rightmost_one((uint8 *)bits, len, sketchsz, sketchnum); |
| } |
| |
| Datum sketch_leftmost_zero(PG_FUNCTION_ARGS) |
| { |
| bytea *bitmap = (bytea *)PG_GETARG_BYTEA_P(0); |
| size_t sketchsz = PG_GETARG_INT32(1); /* size in bits */ |
| size_t sketchnum = PG_GETARG_INT32(2); /* from the left! */ |
| char * bits = VARDATA(bitmap); |
| size_t len = VARSIZE_ANY_EXHDR(bitmap); |
| |
| return leftmost_zero((uint8 *)bits, len, sketchsz, sketchnum); |
| } |
| |
| Datum sketch_array_set_bit_in_place(PG_FUNCTION_ARGS) |
| { |
| |
| bytea *bitmap = (bytea *)PG_GETARG_BYTEA_P(0); |
| int4 numsketches = PG_GETARG_INT32(1); |
| int4 sketchsz = PG_GETARG_INT32(2); /* size in bits */ |
| int4 sketchnum = PG_GETARG_INT32(3); /* from the left! */ |
| int4 bitnum = PG_GETARG_INT32(4); /* from the right! */ |
| |
| return array_set_bit_in_place(bitmap, |
| numsketches, |
| sketchsz, |
| sketchnum, |
| bitnum); |
| } |
| |
| /* In some cases with large numbers, log2 seems to round up incorrectly. */ |
| int4 safe_log2(int64 x) |
| { |
| int4 out = trunc(log2(x)); |
| |
| while ((((int64)1) << out) > x) |
| out--; |
| return out; |
| } |