methods/sketch/src/pg_gp/countmin.py_in - madlib - Git at Google

 import plpy
 import hashlib
 import struct
 from struct import pack, unpack
 from math import log
 import base64
 # import numpy as np

 # Layout constants for the serialized count-min sketch read by this module.
 __ranges = 8*8 # INT64BITS: one sub-sketch per dyadic level (power of 2)
 __depth = 8 # magic # of hash functions (rows per sub-sketch)
 __numcounters = 1024 # magic mod of hash function (counters per row)
 # NOTE(review): the code below uses this value as the stride, in bytes, of one
 # row of 8-byte counters.  That matches __numcounters * 8 only because
 # __depth happens to equal 8 -- confirm before changing __depth.
 __countmin_sz = __depth*__numcounters
 __numsketches = __ranges
 total_size = __numsketches * __countmin_sz
 # Plain int literal: Python 2 promotes to long automatically, and the old
 # "1L" spelling is a syntax error on Python 3.
 __max_int64 = (1 << 63) - 1
 # This is -(2^63 - 1), i.e. one greater than the true int64 minimum; the
 # range-splitting code compensates for that explicitly.
 __min_int64 = __max_int64 * (-1)

 #!
 # Estimate how many times a single value was added to a count-min sketch.
 # \param b64sketch the base64-encoded serialized sketch; must not be None
 # \param val the integer value to look up (it is packed as a native int64)
 # \return the smallest counter found for val across the hash rows
 def count(b64sketch, val):
     if b64sketch is None:
         plpy.error("Sketch error: NULL sketch is provided!")

     try:
         return __do_count(base64.b64decode(b64sketch), val)
     except (TypeError, ValueError, struct.error):
         # Malformed input surfaces differently per interpreter: Python 2's
         # b64decode raises TypeError, Python 3's raises binascii.Error (a
         # ValueError subclass).  A truncated sketch or a value that does not
         # fit an int64 makes pack/unpack raise struct.error.
         plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

 #!
 # Look up a single value in the decoded sketch.  Only the first __depth rows
 # are consulted, which is the same slice __do_rangecount uses for ranges of
 # width 1 (dyad 0).
 # \param all_sketch the decoded (binary) sketch
 # \param val the integer value to look up
 # \return the estimate computed by __do_count_rows over those rows
 def __do_count(all_sketch, val):
     rows = []
     for row_idx in range(__depth):
         start = row_idx * __countmin_sz
         rows.append(all_sketch[start:start + __countmin_sz])
     return __do_count_rows(rows, val)

 #!
 # Count-min lookup: hash val once with MD5, derive one counter column per
 # row from the digest, and return the smallest of the selected counters.
 # \param rows __depth byte strings, each holding 8-byte native int64 counters
 # \param val the integer key; packed as a native int64 before hashing
 # \return the minimum counter value over the __depth rows
 def __do_count_rows(rows, val):
     digest = hashlib.md5(pack('@q', val)).digest()

     # Each row's hash is two consecutive digest bytes read low byte first,
     # i.e. a little-endian unsigned 16-bit integer.
     hashes = unpack('<%dH' % __depth, digest[:2 * __depth])

     estimates = []
     for i in range(__depth):
         # byte offset of the chosen 8-byte counter inside row i
         offset = (hashes[i] % __numcounters) * 8
         estimates.append(unpack('@q', rows[i][offset:offset + 8])[0])

     return min(estimates)

 #!
 # Integer base-2 logarithm.
 # \param x an integer
 # \return floor(log2(x)) for x > 0, and -1 for x <= 0
 def intlog2(x):
     i = 0
     while x > 0:
         # Floor division keeps x an integer on Python 3 as well; plain "/"
         # would turn it into a float there and the loop would only stop on
         # floating-point underflow, returning a wrong result.
         x //= 2
         i += 1
     return i - 1

 #!
 # Split an arbitrary inclusive range [bot, top] into a list of dyadic ranges
 # (aligned ranges whose width is a power of 2).
 # E.g. 14-48 is covered by [14-15], [16-31], [32-47] and [48-48]; the pieces
 # are appended in the order the recursion finds them, which is not
 # necessarily ascending.
 # To manage overflow issues, no range larger than 2^63-1 is generated.
 # \param bot the bottom of the range (inclusive)
 # \param top the top of the range (inclusive)
 # \return a list of [low, high] pairs
 #
 def __find_ranges(bot, top):
     # start with an empty result list and the highest power, RANGES-1
     found = []
     return __find_ranges_recursive(bot, top, __ranges - 1, found)

 #!
 # Find the dyadic ranges covering [bot, top] via recursive calls to this
 # routine, pulling out smaller and smaller powers of 2.
 # To manage overflow issues, do not generate a range larger than 2^63-1!
 # All arithmetic uses plain integers and floor division so the routine behaves
 # the same on Python 2 and Python 3.
 # \param bot the bottom of the range (inclusive)
 # \param top the top of the range (inclusive)
 # \param power recursion budget: lowered by one on each nested call (reset to
 #        62 when an oversized range is split); recursion stops once negative
 # \param r the rangelist built so far; appended to in place
 # \return r, with the [low, high] pairs found for this range appended
 def __find_ranges_recursive(bot, top, power, r):
     # Sanity check
     if (top < bot or power < 0):
         # skip.  we get a bunch of off-by-one errors here right now
         return r

     if (top == bot):
         # base case of recursion, a range of the form [x-x].
         r.append([bot, bot])
         return r

     # a range that's 2^63 or larger will overflow int64's in the
     # code below, so recurse to the left and right by hand
     if (top >= 0 and bot < 0):
         # don't even try to figure out how wide this range is, just split
         r = __find_ranges_recursive(bot, -1, power-1, r)
         r = __find_ranges_recursive(0, top, power-1, r)
         return r

     width = top - bot + 1

     # account for the fact that MIN and MAX are 1 off the true power of 2
     if (top == __max_int64 or bot == __min_int64):
         width += 1

     dyad = intlog2(width)
     if (dyad > 62):
         # dangerously big, so split.  we know that we don't span 0.
         sign = -1 if (top < 0) else 1
         r = __find_ranges_recursive(bot, (1 << 62)*sign - 1, 62, r)
         r = __find_ranges_recursive((1 << 62)*sign, top, 62, r)
         return r

     # if we get here, we have a range of size 2 or greater.
     # Find the largest dyadic range width in this range.
     pow_dyad = 1 << dyad

     # if this is a complete dyad
     if (pow_dyad == width):
         # if aligned, just append and return
         if (bot % pow_dyad == 0):
             r.append([bot,top])
             return r
         else:
             # largest dyad that fits is 1 power-of-2 smaller
             # (floor division: pow_dyad must stay an integer)
             pow_dyad //= 2

     if ((bot == __min_int64) or (bot % pow_dyad == 0)):
         # our range is left-aligned on the dyad's min
         r.append([bot, None])
         if (top == __max_int64):
             # special case: no need to recurse on right, and doing so
             # could overflow, so handle by hand
             r[-1][1] = __max_int64
         else :
             # -1 on end of next line because range to the right will
             # start at the power-of-2 boundary.
             r[-1][1] = (bot + pow_dyad - 1)
             if (bot == __min_int64):
                 # account for fact that __min_int64 is 1 bigger than -2^63
                 r[-1][1] -= 1
             newbot = r[-1][1] + 1
             # recurse on right at finer grain
             r = __find_ranges_recursive(newbot, top, power-1, r)
     elif (top == __max_int64 or ((top+1) % pow_dyad) == 0):
         # our range is right-aligned on the dyad's max.
         r.append([None, top])
         if (bot == __min_int64):
             # special case: no need to recurse on left, and doing so
             # could overflow, so handle by hand
             r[-1][0] = __min_int64
         else:
             r[-1][0] = (top - pow_dyad + 1)
             if (top == __max_int64):
                 # account for fact that __max_int64 is 1 smaller than 2^63
                 r[-1][0] += 1
             newtop = r[-1][0] - 1
             # recurse on left at finer grain.
             r = __find_ranges_recursive(bot, newtop, power-1, r)
     else:
         # we straddle a power of 2: split at the largest multiple of
         # pow_dyad that is <= top (floor division keeps this exact)
         power_of_2 = pow_dyad*(top//pow_dyad)

         # recurse on the left part at finer grain
         r = __find_ranges_recursive(bot, power_of_2 - 1, power-1, r)
         # recurse on the right part at finer grain
         r = __find_ranges_recursive(power_of_2, top, power-1, r)
     return r

 #!
 # Estimate how many sketched values fall in the inclusive range [bot, top].
 # \param b64sketch the base64-encoded serialized sketch; must not be None
 # \param bot the bottom of the range (inclusive)
 # \param top the top of the range (inclusive); must not be below bot
 # \return the sum of the estimates of the dyadic ranges covering [bot, top]
 def rangecount(b64sketch, bot, top):
     if b64sketch is None:
         plpy.error("Sketch error: NULL sketch is provided!")
     if bot > top:
         plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format(
             str(bot), str(top)))

     try:
         return __do_rangecount(base64.b64decode(b64sketch), bot, top)
     except (TypeError, ValueError, struct.error):
         # Python 2's b64decode raises TypeError on bad input, Python 3's
         # raises binascii.Error (a ValueError subclass); a truncated sketch
         # makes unpack raise struct.error.
         plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

 #!
 # Estimate the number of values in [bot, top] from the decoded sketch: the
 # range is split into dyadic ranges, each one is looked up in the sub-sketch
 # for its power of 2, and the individual estimates are added up.
 # \param all_sketch the decoded (binary) sketch
 # \param bot the bottom of the range (inclusive)
 # \param top the top of the range (inclusive)
 # \return the summed estimate (0 when no ranges are produced)
 def __do_rangecount(all_sketch, bot, top):
     # cut the sketch into its rows: __depth rows for each of the __ranges
     # dyadic levels
     row_total = __depth * __ranges
     rows = [all_sketch[k * __countmin_sz:(k + 1) * __countmin_sz]
             for k in range(row_total)]

     # The original comment reports that taking the length of the range list
     # did not work in this environment, so the list is walked directly.
     # __find_ranges will not generate a span larger than 2^63-1, so
     # only 2^63 or less needs to be considered.
     estimate = 0
     for lo, hi in __find_ranges(bot, top):
         # What power of 2 is this range, and which key identifies it?
         if lo == __min_int64 and hi == -1:
             # special case to avoid overflow: full range left of 0
             dyad, key = 63, -1
         elif lo == 0 and hi == __max_int64:
             # special case to avoid overflow: full range right of 0
             dyad, key = 63, 0
         else:
             width = hi - lo + 1
             # MIN and MAX are each 1 off the true power-of-2 boundary
             if lo == __min_int64:
                 width += 1
             if hi == __max_int64:
                 width += 1
             dyad = intlog2(width)
             # the key is the low end of the range divided by 2^dyad
             key = lo >> dyad
         level_rows = rows[dyad * __depth:(dyad + 1) * __depth]
         estimate += __do_count_rows(level_rows, key)
     return estimate


 #!
 # find the approximate centile in a cm sketch by binary search in the domain of values
 # \param b64sketch the base64-encoded serialized sketch; must not be None
 # \param intcentile the centile to return; must be strictly between 0 and 100
 # \param total the total count of items; must not be negative
 # \return the value found by the binary search in __do_centile
 def centile(b64sketch, intcentile, total):
     if b64sketch is None:
         plpy.error("Sketch error: NULL sketch is provided!")
     if (intcentile <= 0 or intcentile >= 100):
         plpy.error("Sketch error: invalid centile {0} provided!".format(
             str(intcentile)))
     if total < 0:
         plpy.error("Sketch error: invalid count {0} is provided!".format(
             str(total)))

     try:
         return __do_centile(base64.b64decode(b64sketch), intcentile, total)
     except (TypeError, ValueError, struct.error):
         # Python 2's b64decode raises TypeError on bad input, Python 3's
         # raises binascii.Error (a ValueError subclass); a truncated sketch
         # makes unpack raise struct.error.
         plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

 #!
 # Binary-search the int64 domain for the smallest value v such that the
 # estimated count of [__min_int64, v] reaches the requested centile of total.
 # total is taken as given; it is not cross-checked against the sketch's own
 # full-range estimate.
 # \param all_sketches the decoded (binary) sketch
 # \param intcentile the centile to locate (not validated here)
 # \param total the total count of items
 # \return the value the search converges on
 def __do_centile(all_sketches, intcentile, total):
     # number of items that must lie at or below the answer (a float)
     centile_cnt = total * intcentile / 100.0

     loguess = __min_int64
     higuess = __max_int64
     while loguess < higuess:
         # Floor division keeps the midpoint an exact integer; true division
         # would go through a float on Python 3 and lose precision over the
         # 64-bit domain.  The difference is never negative here.
         curguess = loguess + (higuess - loguess) // 2
         curcount = __do_rangecount(all_sketches, __min_int64, curguess)
         if curcount >= centile_cnt:
             # overshot (or hit): the answer is at or below curguess
             higuess = curguess
         else:
             # undershot: the answer is strictly above curguess
             loguess = curguess + 1
     return loguess


 #!
 # Build an equi-width histogram over [themin, themax] from a cm sketch.
 # \param b64sketch the base64-encoded serialized sketch; must not be None
 # \param themin the bottom of the histogram range (inclusive)
 # \param themax the top of the histogram range (inclusive); must not be
 #        below themin
 # \param buckets the number of buckets requested; must be positive
 # \return a list of [low, high, estimated count] entries, one per bucket
 def width_histogram(b64sketch, themin, themax, buckets):
     if b64sketch is None:
         plpy.error("Sketch error: NULL sketch is provided!")
     if themin > themax:
         plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format(
             str(themin), str(themax)))
     if buckets <= 0:
         plpy.error("Sketch error: negative nbuckets {0} provided!".format(
             str(buckets)))

     try:
         return __do_width_histo(base64.b64decode(b64sketch), themin, themax,
                 buckets)
     except (TypeError, ValueError, struct.error):
         # Python 2's b64decode raises TypeError on bad input, Python 3's
         # raises binascii.Error (a ValueError subclass); a truncated sketch
         # makes unpack raise struct.error.
         plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

 #!
 # Compute the equi-width histogram: [themin, themax] is cut into buckets of
 # equal integer width and each bucket's count is estimated from the sketch.
 # The last of the requested buckets absorbs any remainder up to themax; when
 # there are more buckets than distinct values, fewer buckets are returned.
 # \param all_sketches the decoded (binary) sketch
 # \param themin the bottom of the histogram range (inclusive)
 # \param themax the top of the histogram range (inclusive)
 # \param buckets the number of buckets requested (a positive integer)
 # \return a list of [low, high, estimated count] entries
 def __do_width_histo(all_sketches, themin, themax, buckets):
     # Exact integer division: going through float is inexact for ranges wider
     # than 2^53 and could round the step up, pushing bucket edges past themax.
     step = max(1, (themax - themin + 1) // buckets)
     histo = []
     for i in range(buckets):
         binlo = themin + i * step
         if binlo > themax:
             # more buckets than values: nothing left to cover
             break
         binhi = themax if i == buckets - 1 else binlo + step - 1
         binval = __do_rangecount(all_sketches, binlo, binhi)
         histo.append([binlo, binhi, binval])
     return histo

 #!
 # Build an equi-depth histogram from a cm sketch: bucket boundaries are
 # chosen at evenly spaced centiles.
 # \param b64sketch the base64-encoded serialized sketch; must not be None
 # \param buckets the number of buckets requested; must be positive
 # \return a list of [low, high, estimated count] entries
 def depth_histogram(b64sketch, buckets):
     if b64sketch is None:
         plpy.error("Sketch error: NULL sketch is provided!")
     if buckets <= 0:
         plpy.error("Sketch error: negative nbuckets {0} provided!".format(
             str(buckets)))

     try:
         return __do_depth_histo(base64.b64decode(b64sketch), buckets)
     except (TypeError, ValueError, struct.error):
         # Python 2's b64decode raises TypeError on bad input, Python 3's
         # raises binascii.Error (a ValueError subclass); a truncated sketch
         # makes unpack raise struct.error.
         plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

 #!
 # Compute the equi-depth histogram.  The upper edge of every bucket but the
 # last is the centile at a multiple of the per-bucket step; the last bucket
 # always ends at __max_int64.  A centile that does not move past the previous
 # bucket's upper edge is skipped, so fewer buckets than requested may come
 # back.
 # NOTE(review): with more than 100 buckets the step is clamped to 1, so the
 # centile passed to __do_centile can reach 100 or more; it is not validated
 # there.
 # \param all_sketches the decoded (binary) sketch
 # \param buckets the number of buckets requested (a positive integer)
 # \return a list of [low, high, estimated count] entries
 def __do_depth_histo(all_sketches, buckets):
     # centile points covered by each bucket, never less than 1
     pct_step = max(1, int(100.0 / float(buckets)))
     # the sketch's own estimate over the whole domain is the item total
     total = __do_rangecount(all_sketches, __min_int64, __max_int64)

     histo = []
     lower = __min_int64
     last_bucket = buckets - 1
     for idx in range(buckets):
         if idx == last_bucket:
             # this is the top bucket
             upper = __max_int64
         else:
             upper = __do_centile(all_sketches, (idx + 1) * pct_step, total)
             if idx > 0 and upper <= histo[-1][1]:
                 # next centile is not above the previous edge; skip
                 continue
         bucket = [lower, upper]
         histo.append(bucket)
         bucket.append(__do_rangecount(all_sketches, bucket[0], bucket[1]))
         lower = upper + 1
     return histo
	import plpy
	import hashlib
	import struct
	from struct import pack, unpack
	from math import log
	import base64
	# import numpy as np

	__ranges = 8*8 # INT64BITS
	__depth = 8 # magic # of hash functions
	__numcounters = 1024 # magic mod of hash function
	__countmin_sz = __depth*__numcounters
	__numsketches = __ranges
	total_size = __numsketches * __countmin_sz
	__max_int64 = (1L << 63) - 1
	__min_int64 = __max_int64 * (-1)

	def count(b64sketch, val):
	if b64sketch is None:
	plpy.error("Sketch error: NULL sketch is provided!")

	try:
	return __do_count(base64.b64decode(b64sketch), val)
	except (TypeError, struct.error) as e:
	plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

	def __do_count(all_sketch, val):
	rows = [ all_sketch[i__countmin_sz:(i+1)__countmin_sz] for i in range(0,__depth) ]
	return __do_count_rows(rows, val)

	def __do_count_rows(rows, val):
	m = hashlib.md5(pack('@q', val)).hexdigest()

	# we have to flip the bytes around here
	col_per_row = [int(m[i+2:i+4]+m[i:i+2],16) % __numcounters for i in range(0,__depth*4,4)]

	counts = [rows[i][col_per_row[i]8:col_per_row[i]8+8] for i in range(0,__depth)]

	return min([unpack('@q',x)[0] for x in counts])

	def intlog2(x):
	i = 0
	while (x > 0):
	x /= 2
	i += 1
	return i-1

	#!
	# convert an arbitrary range [bot-top] into a rangelist of dyadic ranges.
	# E.g. convert 14-48 into [[14-15], [16-31], [32-47], [48-48]]
	# To manage overflow issues, do not generate a range larger than 2^63-1!
	# \param bot the bottom of the range (inclusive)
	# \param top the top of the range (inclusive)
	# \param r the list of ranges to be returned
	#
	def __find_ranges(bot, top):
	# kick off the recursion at power RANGES-1
	return __find_ranges_recursive(bot, top, __ranges-1, [])

	#!
	# find the ranges via recursive calls to this routine, pulling out smaller and
	# smaller powers of 2
	# To manage overflow issues, do not generate a range larger than 2^63-1!
	# \param bot the bottom of the range (inclusive)
	# \param top the top of the range (inclusive)
	# \param power the highest power to start with
	# \param r the rangelist to be returned
	def __find_ranges_recursive(bot, top, power, r):
	# Sanity check
	if (top < bot or power < 0):
	# skip. we get a bunch of off-by-one errors here right now
	return r

	if (top == bot):
	# base case of recursion, a range of the form [x-x].
	r.append([bot, bot])
	return r

	# a range that's 2^63 or larger will overflow int64's in the
	# code below, so recurse to the left and right by hand
	if (top >= 0 and bot < 0):
	# don't even try to figure out how wide this range is, just split
	r = __find_ranges_recursive(bot, -1, power-1, r)
	r = __find_ranges_recursive(0, top, power-1, r)
	return r

	width = top - bot + 1L

	# account for the fact that MIN and MAX are 1 off the true power of 2
	if (top == __max_int64 or bot == __min_int64):
	width += 1

	dyad = intlog2(width)
	if (dyad > 62):
	# dangerously big, so split. we know that we don't span 0.
	sign = -1 if (top < 0) else 1
	r = __find_ranges_recursive(bot, (1L << 62)*sign - 1, 62, r)
	r = __find_ranges_recursive((1L << 62)*sign, top, 62, r)
	return r

	# if we get here, we have a range of size 2 or greater.
	# Find the largest dyadic range width in this range.
	pow_dyad = 1L << dyad

	# if this is a complete dyad
	if (pow_dyad == width):
	# if aligned, just append and return
	if (bot % pow_dyad == 0):
	r.append([bot,top])
	return r
	else:
	# largest dyad that fits is 1 power-of-2 smaller
	pow_dyad /= 2

	if ((bot == __min_int64) or (bot % pow_dyad == 0)):
	# our range is left-aligned on the dyad's min
	r.append([bot, None])
	if (top == __max_int64):
	# special case: no need to recurse on right, and doing so
	# could overflow, so handle by hand
	r[-1][1] = __max_int64
	else :
	# -1 on end of next line because range to the right will
	# start at the power-of-2 boundary.
	r[-1][1] = (bot + pow_dyad - 1)
	if (bot == __min_int64):
	# account for fact that __min_int64 is 1 bigger than -2^63
	r[-1][1] -= 1
	newbot = r[-1][1] + 1
	# recurse on right at finer grain
	r = __find_ranges_recursive(newbot, top, power-1, r)
	elif (top == __max_int64 or ((top+1) % pow_dyad) == 0):
	# our range is right-aligned on the dyad's max.
	r.append([None, top])
	if (bot == __min_int64):
	# special case: no need to recurse on left, and doing so
	# could overflow, so handle by hand
	r[-1][0] = __min_int64
	else:
	r[-1][0] = (top - pow_dyad + 1)
	if (top == __max_int64):
	# account for fact that __max_int64 is 1 smaller than 2^63
	r[-1][0] += 1
	newtop = r[-1][0] - 1
	# recurse on left at finer grain.
	r = __find_ranges_recursive(bot, newtop, power-1, r)
	else:
	# we straddle a power of 2
	power_of_2 = pow_dyad*(top/pow_dyad)

	# recurse on right at finer grain
	r = __find_ranges_recursive(bot, power_of_2 - 1, power-1, r)
	# recurse on left at finer grain
	r = __find_ranges_recursive(power_of_2, top, power-1, r)
	return r

	def rangecount(b64sketch, bot, top):
	if b64sketch is None:
	plpy.error("Sketch error: NULL sketch is provided!")
	if bot > top:
	plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format(
	str(bot), str(top)))

	try:
	return __do_rangecount(base64.b64decode(b64sketch), bot, top)
	except (TypeError, struct.error) as e:
	plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

	def __do_rangecount(all_sketch, bot, top):
	cursum = 0
	rows = [ all_sketch[i__countmin_sz:(i+1)__countmin_sz] for i in range(0,__depth*__ranges) ]
	r = __find_ranges(bot, top)
	# for obscure reasons, len(r) isn't working so use sum to compute
	lenny = sum([1 for i in r])

	# __find_ranges will not generate a span larger than 2^63-1, so
	# we only need to consider 2^63 or less.
	for i in range(0, lenny):
	# What power of 2 is this range?
	if (r[i][0] == __min_int64 and r[i][1] == -1):
	# special case to avoid overflow: full range left of 0
	dyad = 63
	countval = -1
	elif (r[i][0] == 0 and r[i][1] == __max_int64):
	# special case to avoid overflow: full range right of 0
	dyad = 63
	countval = 0
	else:
	width = r[i][1] - r[i][0] + 1
	if (r[i][0] == __min_int64):
	width += 1
	if (r[i][1] == __max_int64):
	width += 1
	# Divide min of range by 2^dyad and get count
	dyad = intlog2(width)
	countval = r[i][0] >> dyad
	val = __do_count_rows(rows[dyad__depth:(dyad+1)__depth], countval)

	cursum += val
	return cursum


	#!
	# find the approximate centile in a cm sketch by binary search in the domain of values
	# \param transval the current transition value
	# \param intcentile the centile to return
	# \param total the total count of items
	def centile(b64sketch, intcentile, total):
	if b64sketch is None:
	plpy.error("Sketch error: NULL sketch is provided!")
	if (intcentile <= 0 or intcentile >= 100):
	plpy.error("Sketch error: invalid centile {0} provided!".format(
	str(intcentile)))
	if total < 0:
	plpy.error("Sketch error: invalid count {0} is provided!".format(
	str(total)))

	try:
	return __do_centile(base64.b64decode(b64sketch), intcentile, total)
	except (TypeError, struct.error) as e:
	plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

	def __do_centile(all_sketches, intcentile, total):
	# est_cnt = __do_rangecount(all_sketches, __min_int64, __max_int64)
	# if total != est_cnt:
	# plpy.error("Sketch error: cnt is {0}, but sketch has {1}".format(
	# str(total), str(est_cnt)))

	centile_cnt = (total * intcentile/100.0);

	loguess = __min_int64
	higuess = __max_int64
	while loguess < higuess:
	curguess = loguess + int((higuess - loguess) / 2)
	curcount = __do_rangecount(all_sketches, __min_int64, curguess)
	if (curcount >= centile_cnt):
	# overshot
	higuess = curguess
	else:
	# undershot
	loguess = curguess + 1
	return loguess


	def width_histogram(b64sketch, themin, themax, buckets):
	if b64sketch is None:
	plpy.error("Sketch error: NULL sketch is provided!")
	if themin > themax:
	plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format(
	str(themin), str(themax)))
	if buckets <= 0:
	plpy.error("Sketch error: negative nbuckets {0} provided!".format(
	str(buckets)))

	try:
	return __do_width_histo(base64.b64decode(b64sketch), themin, themax,
	buckets)
	except (TypeError, struct.error) as e:
	plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

	def __do_width_histo(all_sketches, themin, themax, buckets):
	step = int(float(themax-themin+1) / float(buckets))
	step = 1 if step < 1 else step
	histo = []
	for i in range(0, buckets):
	binlo = themin + i*step;
	if (binlo > themax):
	break
	binhi = themax if (i == buckets-1) else (themin + (i+1)*step - 1)
	binval = __do_rangecount(all_sketches, binlo, binhi)
	histo.append([binlo,binhi,binval])
	return histo

	def depth_histogram(b64sketch, buckets):
	if b64sketch is None:
	plpy.error("Sketch error: NULL sketch is provided!")
	if buckets <= 0:
	plpy.error("Sketch error: negative nbuckets {0} provided!".format(
	str(buckets)))

	try:
	return __do_depth_histo(base64.b64decode(b64sketch), buckets)
	except (TypeError, struct.error) as e:
	plpy.error("Sketch error: wrongly formatted cmsketch object provided!")

	def __do_depth_histo(all_sketches, buckets):
	step = int(100.0 / float(buckets))
	step = 1 if step < 1 else step
	total = __do_rangecount(all_sketches, __min_int64, __max_int64)
	binlo = __min_int64
	histo = []

	for i in range(0, buckets):
	if (i < buckets - 1):
	cent = __do_centile(all_sketches, (i+1)*step, total);
	if (i > 0 and cent <= histo[-1][1]):
	# next centile is lower than previous; skip
	continue;
	histo.append([binlo,cent])
	else:
	# this is the top bucket
	histo.append([binlo, __max_int64])
	histo[-1].append(__do_rangecount(all_sketches,\
	histo[-1][0], histo[-1][1]))
	binlo = histo[-1][1] + 1;
	return histo