blob: c052ca7db90ae7bff0b3e1c3233962c83937ba0c [file] [log] [blame]
/*!
* \file sortasort.c
*
* \brief sortasort dictionary implementation
*/
/*!
* A "sortasort" is a pre-marshalled *set* (no dups) of values intended for
* append and query operations only (no deletion). It's not a
* particularly smart data structure. Cuckoo hashing would be a
* fancier solution.
*/
/*
* It is structured as a header, followed by a fixed-length "directory"
* (an array of offsets) that point to the actual Datums
* concatenated in a variable-length array at the end of the directory.
* The initial directory entries are sorted in ascending order of the Datums
* they point to, but the last < SORTA_SLOP entries are left unsorted to facilitate
* efficient insertion. Binary Search is used on all but those last entries,
* which must be scanned. At every k*SORTA_SLOP'th insert, the full directory is
* sorted.
*/
//#include <ctype.h>
#include <postgres.h>
#include <fmgr.h>
#include "sortasort.h"
#include "sketch_support.h"
#define SORTASORT_DATA(s) (((char *)(s->dir)) + \
(s->capacity * (sizeof (s->dir[0]))))
/*
* get the ith item stored in sortasort
*/
char *sortasort_getval(sortasort *s, unsigned i) {
char *data_ptr = SORTASORT_DATA(s);
char *res = NULL;
Datum dat;
if (i >= s->num_vals) {
elog(ERROR, "attempt to get item at illegal index %d in sortasort", i);
}
if (s->dir[i] >= s->storage_sz) {
elog(ERROR, "illegal offset %u in sortasort", s->dir[i]);
}
res = data_ptr + s->dir[i];
dat = PointerExtractDatum(res, s->typByVal);
if (s->dir[i]
+ ExtractDatumLen(dat, s->typLen, s->typByVal, s->storage_sz - s->dir[i])
> s->storage_sz) {
elog(ERROR, "value overruns size of sortasort");
}
return res;
}
/*!
* given a pre-allocated sortasort, set up its metadata
* first argument s is the
* \param s a pre-allocated sortasort
* \param capacity size of the sortasort directory
* \param s_sz size of s
* \param typLen Postgres type length (-1 for bytea, -2 for cstring)
*/
sortasort *
sortasort_init(sortasort *s,
size_t capacity,
size_t s_sz,
int typLen,
bool typByVal)
{
/* capacity is the size of the directory: i.e. max number of Datums it can hold */
s->capacity = capacity;
/* storage_sz is the number of bytes available for Datums at the end. */
s->storage_sz = s_sz - sizeof(sortasort) - capacity*sizeof(s->dir[0]);
if (s_sz - sizeof(sortasort) <= capacity*sizeof(s->dir[0]))
elog(
ERROR,
"sortasort initialized too small to hold its own directory");
s->typLen = typLen;
s->typByVal = typByVal;
/* number of values so far */
s->num_vals = 0;
/* offset after the directory to do the next insertion */
s->storage_cur = 0;
return(s);
}
/*! comparison function for qsort_arg */
int sorta_cmp(const void *i, const void *j, void *thunk)
{
/* the "thunk" in this case is the sortasort being sorted */
sortasort *s = (sortasort *)thunk;
char *dat1 = sortasort_getval(s, ((unsigned *)i) - s->dir);
char *dat2 = sortasort_getval(s, ((unsigned *)j) - s->dir);
int len = s->typLen;
int shorter;
/*
* we always use typByVal = true, since we've marshalled the data into place
*/
if (len < 0) {
len = (int)ExtractDatumLen(PointerGetDatum(dat1), len, s->typByVal, -1);
if ((shorter = (len - ExtractDatumLen(PointerGetDatum(dat2), len, s->typByVal, -1))))
/* order by length */
return shorter;
/* else drop through */
}
/* byte ordering */
return memcmp(dat1, dat2, len);
}
/*!
* insert a new element into s_in if there's room and return TRUE
* if not enough room, return FALSE
* \param s_in a sortasort
* \param dat a Datum to insert
* \param len the Postgres typLen
*/
int sortasort_try_insert(sortasort *s_in, Datum dat, int len)
{
/* sanity check */
void *datp = DatumExtractPointer(dat, s_in->typByVal);
/* first check to see if the element is already there */
int found = sortasort_find(s_in, dat);
if (found >= 0 && found < (int)s_in->num_vals) {
/* found! just return TRUE */
return true;
}
len = ExtractDatumLen(dat, len, s_in->typByVal, -1);
/* sanity check */
if (found < -1 || found >= (int) s_in->num_vals)
elog(ERROR,
"invalid offset %d returned by sortasort_find",
found);
/* we need to insert v. return FALSE if not enough space. */
if (s_in->storage_cur + len >= s_in->storage_sz) {
/* caller will have to allocate a bigger one and try again */
return false;
}
/* return -1 if no more capacity */
if (s_in->num_vals >= s_in->capacity)
return -1;
/*
* copy dat to the current storage offset, put a pointer into dir,
* update num_vals and storage_cur
*/
memcpy(SORTASORT_DATA(s_in) + s_in->storage_cur,
datp,
len);
s_in->dir[s_in->num_vals++] = s_in->storage_cur;
s_in->storage_cur += len;
if (s_in->storage_cur > s_in->storage_sz)
elog(ERROR, "went off the end of sortasort storage");
/* re-sort every SORTA_SLOP vals */
if (s_in->num_vals % SORTA_SLOP == 0)
qsort_arg(s_in->dir,
s_in->num_vals,
sizeof(s_in->dir[0]),
sorta_cmp,
(void *)s_in);
return true;
}
/*!
* find items in a sortasort. this involves binary search in the sorted prefix,
* and linear search in the <SORTA_SLOP-sized suffix.
* We assume that the sorted prefix is the
* highest multiple of SORTA_SLOP less than s->num_vals
*
* \param s a sortasort
* \param v a value to find
* \return position in directory where item found, or -1 if not found.
* NOTE: return value 0 does not mean false!! It means it *found* the item
* at position 0
*/
int sortasort_find(sortasort *s, Datum dat)
{
int theguess, diff;
int hi = (s->num_vals/SORTA_SLOP)*SORTA_SLOP;
int themin = 0, themax = hi - 1;
size_t i;
int addend, subtrahend;
size_t len = ExtractDatumLen(dat, s->typLen, s->typByVal, -1);
/* binary search on the front of the sortasort */
if (themax >= (int)s->num_vals) {
elog(ERROR,
"sortasort failure: max = %d, num_vals = %zu",
themax,
s->num_vals);
}
theguess = hi / 2;
while (themin < themax ) {
if (!(diff = memcmp(sortasort_getval(s,theguess),
DatumExtractPointer(dat, s->typByVal),
len)))
return theguess;
if (themin == themax - 1) break;
else if (diff < 0) {
/* undershot */
addend = (themax - theguess) / 2;
if (!addend) addend = 1;
themin = theguess;
theguess += addend;
}
else {
/* overshot */
subtrahend = (theguess - themin) / 2;
if (!subtrahend) subtrahend = 1;
themax = theguess;
theguess -= subtrahend;
}
}
/* if we got here, continue with a naive linear search on the tail */
for (i = hi; i < s->num_vals; i++)
if (!memcmp(sortasort_getval(s, i),
DatumExtractPointer(dat, s->typByVal),
len))
return i;
return -1;
}