blob: 67a862ff4d67d200108fd1e5d642aef90dbf2dbb [file] [log] [blame]
import plpy
import hashlib
import struct
from struct import pack, unpack
from math import log
import base64
# import numpy as np
__ranges = 8*8 # INT64BITS
__depth = 8 # magic # of hash functions
__numcounters = 1024 # magic mod of hash function
__countmin_sz = __depth*__numcounters
__numsketches = __ranges
total_size = __numsketches * __countmin_sz
__max_int64 = (1L << 63) - 1
__min_int64 = __max_int64 * (-1)
def count(b64sketch, val):
if b64sketch is None:
plpy.error("Sketch error: NULL sketch is provided!")
try:
return __do_count(base64.b64decode(b64sketch), val)
except (TypeError, struct.error) as e:
plpy.error("Sketch error: wrongly formatted cmsketch object provided!")
def __do_count(all_sketch, val):
rows = [ all_sketch[i*__countmin_sz:(i+1)*__countmin_sz] for i in range(0,__depth) ]
return __do_count_rows(rows, val)
def __do_count_rows(rows, val):
m = hashlib.md5(pack('@q', val)).hexdigest()
# we have to flip the bytes around here
col_per_row = [int(m[i+2:i+4]+m[i:i+2],16) % __numcounters for i in range(0,__depth*4,4)]
counts = [rows[i][col_per_row[i]*8:col_per_row[i]*8+8] for i in range(0,__depth)]
return min([unpack('@q',x)[0] for x in counts])
def intlog2(x):
i = 0
while (x > 0):
x /= 2
i += 1
return i-1
#!
# convert an arbitrary range [bot-top] into a rangelist of dyadic ranges.
# E.g. convert 14-48 into [[14-15], [16-31], [32-47], [48-48]]
# To manage overflow issues, do not generate a range larger than 2^63-1!
# \param bot the bottom of the range (inclusive)
# \param top the top of the range (inclusive)
# \param r the list of ranges to be returned
#
def __find_ranges(bot, top):
# kick off the recursion at power RANGES-1
return __find_ranges_recursive(bot, top, __ranges-1, [])
#!
# find the ranges via recursive calls to this routine, pulling out smaller and
# smaller powers of 2
# To manage overflow issues, do not generate a range larger than 2^63-1!
# \param bot the bottom of the range (inclusive)
# \param top the top of the range (inclusive)
# \param power the highest power to start with
# \param r the rangelist to be returned
def __find_ranges_recursive(bot, top, power, r):
# Sanity check
if (top < bot or power < 0):
# skip. we get a bunch of off-by-one errors here right now
return r
if (top == bot):
# base case of recursion, a range of the form [x-x].
r.append([bot, bot])
return r
# a range that's 2^63 or larger will overflow int64's in the
# code below, so recurse to the left and right by hand
if (top >= 0 and bot < 0):
# don't even try to figure out how wide this range is, just split
r = __find_ranges_recursive(bot, -1, power-1, r)
r = __find_ranges_recursive(0, top, power-1, r)
return r
width = top - bot + 1L
# account for the fact that MIN and MAX are 1 off the true power of 2
if (top == __max_int64 or bot == __min_int64):
width += 1
dyad = intlog2(width)
if (dyad > 62):
# dangerously big, so split. we know that we don't span 0.
sign = -1 if (top < 0) else 1
r = __find_ranges_recursive(bot, (1L << 62)*sign - 1, 62, r)
r = __find_ranges_recursive((1L << 62)*sign, top, 62, r)
return r
# if we get here, we have a range of size 2 or greater.
# Find the largest dyadic range width in this range.
pow_dyad = 1L << dyad
# if this is a complete dyad
if (pow_dyad == width):
# if aligned, just append and return
if (bot % pow_dyad == 0):
r.append([bot,top])
return r
else:
# largest dyad that fits is 1 power-of-2 smaller
pow_dyad /= 2
if ((bot == __min_int64) or (bot % pow_dyad == 0)):
# our range is left-aligned on the dyad's min
r.append([bot, None])
if (top == __max_int64):
# special case: no need to recurse on right, and doing so
# could overflow, so handle by hand
r[-1][1] = __max_int64
else :
# -1 on end of next line because range to the right will
# start at the power-of-2 boundary.
r[-1][1] = (bot + pow_dyad - 1)
if (bot == __min_int64):
# account for fact that __min_int64 is 1 bigger than -2^63
r[-1][1] -= 1
newbot = r[-1][1] + 1
# recurse on right at finer grain
r = __find_ranges_recursive(newbot, top, power-1, r)
elif (top == __max_int64 or ((top+1) % pow_dyad) == 0):
# our range is right-aligned on the dyad's max.
r.append([None, top])
if (bot == __min_int64):
# special case: no need to recurse on left, and doing so
# could overflow, so handle by hand
r[-1][0] = __min_int64
else:
r[-1][0] = (top - pow_dyad + 1)
if (top == __max_int64):
# account for fact that __max_int64 is 1 smaller than 2^63
r[-1][0] += 1
newtop = r[-1][0] - 1
# recurse on left at finer grain.
r = __find_ranges_recursive(bot, newtop, power-1, r)
else:
# we straddle a power of 2
power_of_2 = pow_dyad*(top/pow_dyad)
# recurse on right at finer grain
r = __find_ranges_recursive(bot, power_of_2 - 1, power-1, r)
# recurse on left at finer grain
r = __find_ranges_recursive(power_of_2, top, power-1, r)
return r
def rangecount(b64sketch, bot, top):
if b64sketch is None:
plpy.error("Sketch error: NULL sketch is provided!")
if bot > top:
plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format(
str(bot), str(top)))
try:
return __do_rangecount(base64.b64decode(b64sketch), bot, top)
except (TypeError, struct.error) as e:
plpy.error("Sketch error: wrongly formatted cmsketch object provided!")
def __do_rangecount(all_sketch, bot, top):
cursum = 0
rows = [ all_sketch[i*__countmin_sz:(i+1)*__countmin_sz] for i in range(0,__depth*__ranges) ]
r = __find_ranges(bot, top)
# for obscure reasons, len(r) isn't working so use sum to compute
lenny = sum([1 for i in r])
# __find_ranges will not generate a span larger than 2^63-1, so
# we only need to consider 2^63 or less.
for i in range(0, lenny):
# What power of 2 is this range?
if (r[i][0] == __min_int64 and r[i][1] == -1):
# special case to avoid overflow: full range left of 0
dyad = 63
countval = -1
elif (r[i][0] == 0 and r[i][1] == __max_int64):
# special case to avoid overflow: full range right of 0
dyad = 63
countval = 0
else:
width = r[i][1] - r[i][0] + 1
if (r[i][0] == __min_int64):
width += 1
if (r[i][1] == __max_int64):
width += 1
# Divide min of range by 2^dyad and get count
dyad = intlog2(width)
countval = r[i][0] >> dyad
val = __do_count_rows(rows[dyad*__depth:(dyad+1)*__depth], countval)
cursum += val
return cursum
#!
# find the approximate centile in a cm sketch by binary search in the domain of values
# \param transval the current transition value
# \param intcentile the centile to return
# \param total the total count of items
def centile(b64sketch, intcentile, total):
if b64sketch is None:
plpy.error("Sketch error: NULL sketch is provided!")
if (intcentile <= 0 or intcentile >= 100):
plpy.error("Sketch error: invalid centile {0} provided!".format(
str(intcentile)))
if total < 0:
plpy.error("Sketch error: invalid count {0} is provided!".format(
str(total)))
try:
return __do_centile(base64.b64decode(b64sketch), intcentile, total)
except (TypeError, struct.error) as e:
plpy.error("Sketch error: wrongly formatted cmsketch object provided!")
def __do_centile(all_sketches, intcentile, total):
# est_cnt = __do_rangecount(all_sketches, __min_int64, __max_int64)
# if total != est_cnt:
# plpy.error("Sketch error: cnt is {0}, but sketch has {1}".format(
# str(total), str(est_cnt)))
centile_cnt = (total * intcentile/100.0);
loguess = __min_int64
higuess = __max_int64
while loguess < higuess:
curguess = loguess + int((higuess - loguess) / 2)
curcount = __do_rangecount(all_sketches, __min_int64, curguess)
if (curcount >= centile_cnt):
# overshot
higuess = curguess
else:
# undershot
loguess = curguess + 1
return loguess
def width_histogram(b64sketch, themin, themax, buckets):
if b64sketch is None:
plpy.error("Sketch error: NULL sketch is provided!")
if themin > themax:
plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format(
str(themin), str(themax)))
if buckets <= 0:
plpy.error("Sketch error: negative nbuckets {0} provided!".format(
str(buckets)))
try:
return __do_width_histo(base64.b64decode(b64sketch), themin, themax,
buckets)
except (TypeError, struct.error) as e:
plpy.error("Sketch error: wrongly formatted cmsketch object provided!")
def __do_width_histo(all_sketches, themin, themax, buckets):
step = int(float(themax-themin+1) / float(buckets))
step = 1 if step < 1 else step
histo = []
for i in range(0, buckets):
binlo = themin + i*step;
if (binlo > themax):
break
binhi = themax if (i == buckets-1) else (themin + (i+1)*step - 1)
binval = __do_rangecount(all_sketches, binlo, binhi)
histo.append([binlo,binhi,binval])
return histo
def depth_histogram(b64sketch, buckets):
if b64sketch is None:
plpy.error("Sketch error: NULL sketch is provided!")
if buckets <= 0:
plpy.error("Sketch error: negative nbuckets {0} provided!".format(
str(buckets)))
try:
return __do_depth_histo(base64.b64decode(b64sketch), buckets)
except (TypeError, struct.error) as e:
plpy.error("Sketch error: wrongly formatted cmsketch object provided!")
def __do_depth_histo(all_sketches, buckets):
step = int(100.0 / float(buckets))
step = 1 if step < 1 else step
total = __do_rangecount(all_sketches, __min_int64, __max_int64)
binlo = __min_int64
histo = []
for i in range(0, buckets):
if (i < buckets - 1):
cent = __do_centile(all_sketches, (i+1)*step, total);
if (i > 0 and cent <= histo[-1][1]):
# next centile is lower than previous; skip
continue;
histo.append([binlo,cent])
else:
# this is the top bucket
histo.append([binlo, __max_int64])
histo[-1].append(__do_rangecount(all_sketches,\
histo[-1][0], histo[-1][1]))
binlo = histo[-1][1] + 1;
return histo