| import plpy |
| import hashlib |
| import struct |
| from struct import pack, unpack |
| from math import log |
| import base64 |
| # import numpy as np |
| |
| __ranges = 8*8 # INT64BITS |
| __depth = 8 # magic # of hash functions |
| __numcounters = 1024 # magic mod of hash function |
| __countmin_sz = __depth*__numcounters |
| __numsketches = __ranges |
| total_size = __numsketches * __countmin_sz |
| __max_int64 = (1L << 63) - 1 |
| __min_int64 = __max_int64 * (-1) |
| |
| def count(b64sketch, val): |
| if b64sketch is None: |
| plpy.error("Sketch error: NULL sketch is provided!") |
| |
| try: |
| return __do_count(base64.b64decode(b64sketch), val) |
| except (TypeError, struct.error) as e: |
| plpy.error("Sketch error: wrongly formatted cmsketch object provided!") |
| |
| def __do_count(all_sketch, val): |
| rows = [ all_sketch[i*__countmin_sz:(i+1)*__countmin_sz] for i in range(0,__depth) ] |
| return __do_count_rows(rows, val) |
| |
| def __do_count_rows(rows, val): |
| m = hashlib.md5(pack('@q', val)).hexdigest() |
| |
| # we have to flip the bytes around here |
| col_per_row = [int(m[i+2:i+4]+m[i:i+2],16) % __numcounters for i in range(0,__depth*4,4)] |
| |
| counts = [rows[i][col_per_row[i]*8:col_per_row[i]*8+8] for i in range(0,__depth)] |
| |
| return min([unpack('@q',x)[0] for x in counts]) |
| |
| def intlog2(x): |
| i = 0 |
| while (x > 0): |
| x /= 2 |
| i += 1 |
| return i-1 |
| |
| #! |
| # convert an arbitrary range [bot-top] into a rangelist of dyadic ranges. |
| # E.g. convert 14-48 into [[14-15], [16-31], [32-47], [48-48]] |
| # To manage overflow issues, do not generate a range larger than 2^63-1! |
| # \param bot the bottom of the range (inclusive) |
| # \param top the top of the range (inclusive) |
| # \param r the list of ranges to be returned |
| # |
| def __find_ranges(bot, top): |
| # kick off the recursion at power RANGES-1 |
| return __find_ranges_recursive(bot, top, __ranges-1, []) |
| |
| #! |
| # find the ranges via recursive calls to this routine, pulling out smaller and |
| # smaller powers of 2 |
| # To manage overflow issues, do not generate a range larger than 2^63-1! |
| # \param bot the bottom of the range (inclusive) |
| # \param top the top of the range (inclusive) |
| # \param power the highest power to start with |
| # \param r the rangelist to be returned |
| def __find_ranges_recursive(bot, top, power, r): |
| # Sanity check |
| if (top < bot or power < 0): |
| # skip. we get a bunch of off-by-one errors here right now |
| return r |
| |
| if (top == bot): |
| # base case of recursion, a range of the form [x-x]. |
| r.append([bot, bot]) |
| return r |
| |
| # a range that's 2^63 or larger will overflow int64's in the |
| # code below, so recurse to the left and right by hand |
| if (top >= 0 and bot < 0): |
| # don't even try to figure out how wide this range is, just split |
| r = __find_ranges_recursive(bot, -1, power-1, r) |
| r = __find_ranges_recursive(0, top, power-1, r) |
| return r |
| |
| width = top - bot + 1L |
| |
| # account for the fact that MIN and MAX are 1 off the true power of 2 |
| if (top == __max_int64 or bot == __min_int64): |
| width += 1 |
| |
| dyad = intlog2(width) |
| if (dyad > 62): |
| # dangerously big, so split. we know that we don't span 0. |
| sign = -1 if (top < 0) else 1 |
| r = __find_ranges_recursive(bot, (1L << 62)*sign - 1, 62, r) |
| r = __find_ranges_recursive((1L << 62)*sign, top, 62, r) |
| return r |
| |
| # if we get here, we have a range of size 2 or greater. |
| # Find the largest dyadic range width in this range. |
| pow_dyad = 1L << dyad |
| |
| # if this is a complete dyad |
| if (pow_dyad == width): |
| # if aligned, just append and return |
| if (bot % pow_dyad == 0): |
| r.append([bot,top]) |
| return r |
| else: |
| # largest dyad that fits is 1 power-of-2 smaller |
| pow_dyad /= 2 |
| |
| if ((bot == __min_int64) or (bot % pow_dyad == 0)): |
| # our range is left-aligned on the dyad's min |
| r.append([bot, None]) |
| if (top == __max_int64): |
| # special case: no need to recurse on right, and doing so |
| # could overflow, so handle by hand |
| r[-1][1] = __max_int64 |
| else : |
| # -1 on end of next line because range to the right will |
| # start at the power-of-2 boundary. |
| r[-1][1] = (bot + pow_dyad - 1) |
| if (bot == __min_int64): |
| # account for fact that __min_int64 is 1 bigger than -2^63 |
| r[-1][1] -= 1 |
| newbot = r[-1][1] + 1 |
| # recurse on right at finer grain |
| r = __find_ranges_recursive(newbot, top, power-1, r) |
| elif (top == __max_int64 or ((top+1) % pow_dyad) == 0): |
| # our range is right-aligned on the dyad's max. |
| r.append([None, top]) |
| if (bot == __min_int64): |
| # special case: no need to recurse on left, and doing so |
| # could overflow, so handle by hand |
| r[-1][0] = __min_int64 |
| else: |
| r[-1][0] = (top - pow_dyad + 1) |
| if (top == __max_int64): |
| # account for fact that __max_int64 is 1 smaller than 2^63 |
| r[-1][0] += 1 |
| newtop = r[-1][0] - 1 |
| # recurse on left at finer grain. |
| r = __find_ranges_recursive(bot, newtop, power-1, r) |
| else: |
| # we straddle a power of 2 |
| power_of_2 = pow_dyad*(top/pow_dyad) |
| |
| # recurse on right at finer grain |
| r = __find_ranges_recursive(bot, power_of_2 - 1, power-1, r) |
| # recurse on left at finer grain |
| r = __find_ranges_recursive(power_of_2, top, power-1, r) |
| return r |
| |
| def rangecount(b64sketch, bot, top): |
| if b64sketch is None: |
| plpy.error("Sketch error: NULL sketch is provided!") |
| if bot > top: |
| plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format( |
| str(bot), str(top))) |
| |
| try: |
| return __do_rangecount(base64.b64decode(b64sketch), bot, top) |
| except (TypeError, struct.error) as e: |
| plpy.error("Sketch error: wrongly formatted cmsketch object provided!") |
| |
| def __do_rangecount(all_sketch, bot, top): |
| cursum = 0 |
| rows = [ all_sketch[i*__countmin_sz:(i+1)*__countmin_sz] for i in range(0,__depth*__ranges) ] |
| r = __find_ranges(bot, top) |
| # for obscure reasons, len(r) isn't working so use sum to compute |
| lenny = sum([1 for i in r]) |
| |
| # __find_ranges will not generate a span larger than 2^63-1, so |
| # we only need to consider 2^63 or less. |
| for i in range(0, lenny): |
| # What power of 2 is this range? |
| if (r[i][0] == __min_int64 and r[i][1] == -1): |
| # special case to avoid overflow: full range left of 0 |
| dyad = 63 |
| countval = -1 |
| elif (r[i][0] == 0 and r[i][1] == __max_int64): |
| # special case to avoid overflow: full range right of 0 |
| dyad = 63 |
| countval = 0 |
| else: |
| width = r[i][1] - r[i][0] + 1 |
| if (r[i][0] == __min_int64): |
| width += 1 |
| if (r[i][1] == __max_int64): |
| width += 1 |
| # Divide min of range by 2^dyad and get count |
| dyad = intlog2(width) |
| countval = r[i][0] >> dyad |
| val = __do_count_rows(rows[dyad*__depth:(dyad+1)*__depth], countval) |
| |
| cursum += val |
| return cursum |
| |
| |
| #! |
| # find the approximate centile in a cm sketch by binary search in the domain of values |
| # \param transval the current transition value |
| # \param intcentile the centile to return |
| # \param total the total count of items |
| def centile(b64sketch, intcentile, total): |
| if b64sketch is None: |
| plpy.error("Sketch error: NULL sketch is provided!") |
| if (intcentile <= 0 or intcentile >= 100): |
| plpy.error("Sketch error: invalid centile {0} provided!".format( |
| str(intcentile))) |
| if total < 0: |
| plpy.error("Sketch error: invalid count {0} is provided!".format( |
| str(total))) |
| |
| try: |
| return __do_centile(base64.b64decode(b64sketch), intcentile, total) |
| except (TypeError, struct.error) as e: |
| plpy.error("Sketch error: wrongly formatted cmsketch object provided!") |
| |
| def __do_centile(all_sketches, intcentile, total): |
| # est_cnt = __do_rangecount(all_sketches, __min_int64, __max_int64) |
| # if total != est_cnt: |
| # plpy.error("Sketch error: cnt is {0}, but sketch has {1}".format( |
| # str(total), str(est_cnt))) |
| |
| centile_cnt = (total * intcentile/100.0); |
| |
| loguess = __min_int64 |
| higuess = __max_int64 |
| while loguess < higuess: |
| curguess = loguess + int((higuess - loguess) / 2) |
| curcount = __do_rangecount(all_sketches, __min_int64, curguess) |
| if (curcount >= centile_cnt): |
| # overshot |
| higuess = curguess |
| else: |
| # undershot |
| loguess = curguess + 1 |
| return loguess |
| |
| |
| def width_histogram(b64sketch, themin, themax, buckets): |
| if b64sketch is None: |
| plpy.error("Sketch error: NULL sketch is provided!") |
| if themin > themax: |
| plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format( |
| str(themin), str(themax))) |
| if buckets <= 0: |
| plpy.error("Sketch error: negative nbuckets {0} provided!".format( |
| str(buckets))) |
| |
| try: |
| return __do_width_histo(base64.b64decode(b64sketch), themin, themax, |
| buckets) |
| except (TypeError, struct.error) as e: |
| plpy.error("Sketch error: wrongly formatted cmsketch object provided!") |
| |
| def __do_width_histo(all_sketches, themin, themax, buckets): |
| step = int(float(themax-themin+1) / float(buckets)) |
| step = 1 if step < 1 else step |
| histo = [] |
| for i in range(0, buckets): |
| binlo = themin + i*step; |
| if (binlo > themax): |
| break |
| binhi = themax if (i == buckets-1) else (themin + (i+1)*step - 1) |
| binval = __do_rangecount(all_sketches, binlo, binhi) |
| histo.append([binlo,binhi,binval]) |
| return histo |
| |
| def depth_histogram(b64sketch, buckets): |
| if b64sketch is None: |
| plpy.error("Sketch error: NULL sketch is provided!") |
| if buckets <= 0: |
| plpy.error("Sketch error: negative nbuckets {0} provided!".format( |
| str(buckets))) |
| |
| try: |
| return __do_depth_histo(base64.b64decode(b64sketch), buckets) |
| except (TypeError, struct.error) as e: |
| plpy.error("Sketch error: wrongly formatted cmsketch object provided!") |
| |
| def __do_depth_histo(all_sketches, buckets): |
| step = int(100.0 / float(buckets)) |
| step = 1 if step < 1 else step |
| total = __do_rangecount(all_sketches, __min_int64, __max_int64) |
| binlo = __min_int64 |
| histo = [] |
| |
| for i in range(0, buckets): |
| if (i < buckets - 1): |
| cent = __do_centile(all_sketches, (i+1)*step, total); |
| if (i > 0 and cent <= histo[-1][1]): |
| # next centile is lower than previous; skip |
| continue; |
| histo.append([binlo,cent]) |
| else: |
| # this is the top bucket |
| histo.append([binlo, __max_int64]) |
| histo[-1].append(__do_rangecount(all_sketches,\ |
| histo[-1][0], histo[-1][1])) |
| binlo = histo[-1][1] + 1; |
| return histo |