blob: f2bd6a747ca51be4c497384a64a71f0eb7d7c26c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*--------------------------------------------------------------------------
*
* cdbhash.c
* Provides hashing routines to support consistant data distribution/location within Greenplum Database.
*
*--------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <assert.h>
#include "utils/builtins.h"
#include "catalog/pg_type.h"
#include "parser/parse_type.h"
#include "utils/numeric.h"
#include "utils/inet.h"
#include "utils/timestamp.h"
#include "utils/array.h"
#include "utils/date.h"
#include "utils/cash.h"
#include "utils/datetime.h"
#include "utils/nabstime.h"
#include "utils/varbit.h"
#include "utils/acl.h"
#include "fmgr.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "cdb/cdbhash.h"
#include "cdb/cdbutil.h"
#include "cdb/cdbvars.h"
/*
* 32 bit FNV-1 and FNV-1a non-zero initial basis
* NOTE: The FNV-1a initial basis is the same value as FNV-1 by definition.
*/
#define FNV1_32_INIT ((uint32)0x811c9dc5)
#define FNV1_32A_INIT FNV1_32_INIT
/* Constant prime value used for an FNV1/FNV1A hash */
#define FNV_32_PRIME ((uint32)0x01000193)
/* Constant used for hashing a NULL value */
#define NULL_VAL ((uint32)0XF0F0F0F1)
/* Constant used for hashing a NAN value */
#define NAN_VAL ((uint32)0XE0E0E0E1)
/* Constant used for hashing an invalid value */
#define INVALID_VAL ((uint32)0XD0D0D0D1)
/* Constant used to help defining upper limit for random generator */
#define UPPER_VAL ((uint32)0XA0B0C0D1)
/* Fast mod using a bit mask, assuming that y is a power of 2 */
#define FASTMOD(x,y) ((x) & ((y)-1))
/* local function declarations */
uint32 fnv1_32_buf(void *buf, size_t len, uint32 hashval);
uint32 fnv1a_32_buf(void *buf, size_t len, uint32 hashval);
int inet_getkey(inet *addr, unsigned char *inet_key, int key_size);
int ignoreblanks(char *data, int len);
int ispowof2(int numsegs);
/*================================================================
*
* HASH API FUNCTIONS
*
*================================================================
*/
/*
* Create a CdbHash for this session.
*
* CdbHash maintains the following information about the hash.
* In here we set the variables that should not change in the scope of the newly created
* CdbHash, these are:
*
* 1 - number of segments in Greenplum Database.
* 2 - hashingalgorithm used.
* 3 - reduction method.
*
* The hash value itself will be initialized for every tuple in cdbhashinit()
*/
CdbHash *
makeCdbHash(int numsegs, CdbHashAlg algorithm)
{
CdbHash *h;
assert(numsegs > 0); /* verify number of segments is legal. */
assert(algorithm == HASH_FNV_1); /* make sure everybody uses same algorithm */
/* Create a pointer to a CdbHash that includes the hash properties */
h = palloc(sizeof(CdbHash));
/*
* set this hash session characteristics.
*/
h->hash = 0;
h->numsegs = numsegs;
h->hashalg = algorithm;
if (h->hashalg == HASH_FNV_1)
h->hashfn = &fnv1_32_buf;
else if (h->hashalg == HASH_FNV_1A)
h->hashfn = &fnv1a_32_buf;
/*
* set the reduction algorithm: If num_segs is power of 2 use bit mask,
* else use lazy mod (h mod n)
*/
if (ispowof2(numsegs))
{
h->reducealg = REDUCE_BITMASK;
}
else
{
h->reducealg = REDUCE_LAZYMOD;
}
/*
* if we distribute into a relation with an empty partitioning policy,
* we will round robin the tuples starting off from this index. Note that
* the random number is created one per makeCdbHash. This means that commands
* that create a cdbhash object only once for all tuples (like COPY,
* INSERT-INTO-SELECT) behave more like a round-robin distribution, while
* commands that create a cdbhash per row (like INSERT) behave more like a
* random distribution.
*/
h->rrindex = cdb_randint(0, UPPER_VAL);
ereport(DEBUG4,
(errmsg("CDBHASH started using algorithm %d into %d segment databases",
h->hashalg,
h->numsegs)));
return h;
}
/*
* Initialize CdbHash for hashing the next tuple values.
*/
void
cdbhashinit(CdbHash *h)
{
/* reset the hash value to the initial offset basis */
h->hash = FNV1_32_INIT;
}
/**
* Implements datumHashFunction
*/
static void
addToCdbHash(void *cdbHash, void *buf, size_t len)
{
CdbHash *h = (CdbHash*)cdbHash;
h->hash = (h->hashfn) (buf, len, h->hash);
}
extern void varattrib_untoast_ptr_len(Datum d, char **datastart, int *len, void **tofree);
/*
* Add an attribute to the CdbHash calculation.
*/
void
cdbhash(CdbHash *h, Datum datum, Oid type)
{
hashDatum(datum, type, addToCdbHash, (void*)h);
}
/*
* Add an attribute to the hash calculation.
* **IMPORTANT: any new hard coded support for a data type in here
* must be added to isGreenplumDbHashable() below!
*
* Note that the caller should provide the base type if the datum is
* of a domain type. It is quite expensive to call get_typtype() and
* getBaseType() here since this function gets called a lot for the
* same set of Datums.
*
* @param hashFn called to update the hash value.
* @param clientData passed to hashFn.
*/
void
hashDatum(Datum datum, Oid type, datumHashFunction hashFn, void *clientData)
{
void *buf = NULL; /* pointer to the data */
size_t len = 0; /* length for the data buffer */
int64 intbuf; /* an 8 byte buffer for all integer sizes */
float4 buf_f4;
float8 buf_f8;
Timestamp tsbuf; /* timestamp data dype is either a double or
* int8 (determined in compile time) */
TimestampTz tstzbuf;
DateADT datebuf;
TimeADT timebuf;
TimeTzADT *timetzptr;
Interval *intervalptr;
AbsoluteTime abstime_buf;
RelativeTime reltime_buf;
TimeInterval tinterval;
AbsoluteTime tinterval_len;
Numeric num;
bool bool_buf;
char char_buf;
Name namebuf;
ArrayType *arrbuf;
inet *inetptr; /* inet/cidr */
unsigned char inet_hkey[sizeof(inet_struct)];
macaddr *macptr; /* MAC address */
VarBit *vbitptr;
int2vector *i2vec_buf;
oidvector *oidvec_buf;
Cash cash_buf;
AclItem *aclitem_ptr;
uint32 aclitem_buf;
/*
* special case buffers
*/
uint32 nanbuf;
uint32 invalidbuf;
void *tofree = NULL;
/*
* Select the hash to be performed according to the field type we are adding to the
* hash.
*/
switch (type)
{
/*
* ======= NUMERIC TYPES ========
*/
case INT2OID: /* -32 thousand to 32 thousand, 2-byte storage */
intbuf = (int64) DatumGetInt16(datum); /* cast to 8 byte before
* hashing */
buf = &intbuf;
len = sizeof(intbuf);
break;
case INT4OID: /* -2 billion to 2 billion integer, 4-byte
* storage */
intbuf = (int64) DatumGetInt32(datum); /* cast to 8 byte before
* hashing */
buf = &intbuf;
len = sizeof(intbuf);
break;
case INT8OID: /* ~18 digit integer, 8-byte storage */
intbuf = DatumGetInt64(datum); /* cast to 8 byte before
* hashing */
buf = &intbuf;
len = sizeof(intbuf);
break;
case FLOAT4OID: /* single-precision floating point number,
* 4-byte storage */
buf_f4 = DatumGetFloat4(datum);
/*
* On IEEE-float machines, minus zero and zero have different bit
* patterns but should compare as equal. We must ensure that they
* have the same hash value, which is most easily done this way:
*/
if (buf_f4 == (float4) 0)
buf_f4 = 0.0;
buf = &buf_f4;
len = sizeof(buf_f4);
break;
case FLOAT8OID: /* double-precision floating point number,
* 8-byte storage */
buf_f8 = DatumGetFloat8(datum);
/*
* On IEEE-float machines, minus zero and zero have different bit
* patterns but should compare as equal. We must ensure that they
* have the same hash value, which is most easily done this way:
*/
if (buf_f8 == (float8) 0)
buf_f8 = 0.0;
buf = &buf_f8;
len = sizeof(buf_f8);
break;
case NUMERICOID:
num = DatumGetNumeric(datum);
if (NUMERIC_IS_NAN(num))
{
nanbuf = NAN_VAL;
buf = &nanbuf;
len = sizeof(nanbuf);
}
else
/* not a nan */
{
buf = num->n_data;
len = (VARSIZE(num) - NUMERIC_HDRSZ);
}
/*
* If we did a pg_detoast_datum, we need to remember to pfree,
* or we will leak memory. Because of the 1-byte varlena header stuff.
*/
if (num != DatumGetPointer(datum))
tofree = num;
break;
/*
* ====== CHARACTER TYPES =======
*/
case CHAROID: /* char(1), single character */
char_buf = DatumGetChar(datum);
buf = &char_buf;
len = 1;
break;
case BPCHAROID: /* char(n), blank-padded string, fixed storage */
case TEXTOID: /* text */
case VARCHAROID: /* varchar */
case BYTEAOID: /* bytea */
{
int tmplen;
varattrib_untoast_ptr_len(datum, (char **) &buf, &tmplen, &tofree);
/* adjust length to not include trailing blanks */
if (type != BYTEAOID && tmplen > 1)
tmplen = ignoreblanks((char *) buf, tmplen);
len = tmplen;
break;
}
case NAMEOID:
namebuf = DatumGetName(datum);
len = NAMEDATALEN;
buf = NameStr(*namebuf);
/* adjust length to not include trailing blanks */
if (len > 1)
len = ignoreblanks((char *) buf, len);
break;
/*
* ====== OBJECT IDENTIFIER TYPES ======
*/
case OIDOID: /* object identifier(oid), maximum 4 billion */
case REGPROCOID: /* function name */
case REGPROCEDUREOID: /* function name with argument types */
case REGOPEROID: /* operator name */
case REGOPERATOROID: /* operator with argument types */
case REGCLASSOID: /* relation name */
case REGTYPEOID: /* data type name */
intbuf = (int64) DatumGetUInt32(datum); /* cast to 8 byte before hashing */
buf = &intbuf;
len = sizeof(intbuf);
break;
case TIDOID: /* tuple id (6 bytes) */
buf = DatumGetPointer(datum);
len = SizeOfIptrData;
break;
/*
* ====== DATE/TIME TYPES ======
*/
case TIMESTAMPOID: /* date and time */
tsbuf = DatumGetTimestamp(datum);
buf = &tsbuf;
len = sizeof(tsbuf);
break;
case TIMESTAMPTZOID: /* date and time with time zone */
tstzbuf = DatumGetTimestampTz(datum);
buf = &tstzbuf;
len = sizeof(tstzbuf);
break;
case DATEOID: /* ANSI SQL date */
datebuf = DatumGetDateADT(datum);
buf = &datebuf;
len = sizeof(datebuf);
break;
case TIMEOID: /* hh:mm:ss, ANSI SQL time */
timebuf = DatumGetTimeADT(datum);
buf = &timebuf;
len = sizeof(timebuf);
break;
case TIMETZOID: /* time with time zone */
/*
* will not compare to TIMEOID on equal values.
* Postgres never attempts to compare the two as well.
*/
timetzptr = DatumGetTimeTzADTP(datum);
buf = (unsigned char *) timetzptr;
/*
* Specify hash length as sizeof(double) + sizeof(int4), not as
* sizeof(TimeTzADT), so that any garbage pad bytes in the structure
* won't be included in the hash!
*/
len = sizeof(timetzptr->time) + sizeof(timetzptr->zone);
break;
case INTERVALOID: /* @ <number> <units>, time interval */
intervalptr = DatumGetIntervalP(datum);
buf = (unsigned char *) intervalptr;
/*
* Specify hash length as sizeof(double) + sizeof(int4), not as
* sizeof(Interval), so that any garbage pad bytes in the structure
* won't be included in the hash!
*/
len = sizeof(intervalptr->time) + sizeof(intervalptr->month);
break;
case ABSTIMEOID:
abstime_buf = DatumGetAbsoluteTime(datum);
if (abstime_buf == INVALID_ABSTIME)
{
/* hash to a constant value */
invalidbuf = INVALID_VAL;
len = sizeof(invalidbuf);
buf = &invalidbuf;
}
else
{
len = sizeof(abstime_buf);
buf = &abstime_buf;
}
break;
case RELTIMEOID:
reltime_buf = DatumGetRelativeTime(datum);
if (reltime_buf == INVALID_RELTIME)
{
/* hash to a constant value */
invalidbuf = INVALID_VAL;
len = sizeof(invalidbuf);
buf = &invalidbuf;
}
else
{
len = sizeof(reltime_buf);
buf = &reltime_buf;
}
break;
case TINTERVALOID:
tinterval = DatumGetTimeInterval(datum);
/*
* check if a valid interval. the '0' status code
* stands for T_INTERVAL_INVAL which is defined in
* nabstime.c. We use the actual value instead
* of defining it again here.
*/
if(tinterval->status == 0 ||
tinterval->data[0] == INVALID_ABSTIME ||
tinterval->data[1] == INVALID_ABSTIME)
{
/* hash to a constant value */
invalidbuf = INVALID_VAL;
len = sizeof(invalidbuf);
buf = &invalidbuf;
}
else
{
/* normalize on length of the time interval */
tinterval_len = tinterval->data[1] - tinterval->data[0];
len = sizeof(tinterval_len);
buf = &tinterval_len;
}
break;
/*
* ======= NETWORK TYPES ========
*/
case INETOID:
case CIDROID:
inetptr = DatumGetInetP(datum);
len = inet_getkey(inetptr, inet_hkey, sizeof(inet_hkey)); /* fill-in inet_key & get len */
buf = inet_hkey;
break;
case MACADDROID:
macptr = DatumGetMacaddrP(datum);
len = sizeof(macaddr);
buf = (unsigned char *) macptr;
break;
/*
* ======== BIT STRINGS ========
*/
case BITOID:
case VARBITOID:
/*
* Note that these are essentially strings.
* we don't need to worry about '10' and '010'
* to compare, b/c they will not, by design.
* (see SQL standard, and varbit.c)
*/
vbitptr = DatumGetVarBitP(datum);
len = VARBITBYTES(vbitptr);
buf = (char *) VARBITS(vbitptr);
break;
/*
* ======= other types =======
*/
case BOOLOID: /* boolean, 'true'/'false' */
bool_buf = DatumGetBool(datum);
buf = &bool_buf;
len = sizeof(bool_buf);
break;
/*
* We prepare the hash key for aclitems just like postgresql does.
* (see code and comment in acl.c: hash_aclitem() ).
*/
case ACLITEMOID:
aclitem_ptr = DatumGetAclItemP(datum);
aclitem_buf = (uint32) (aclitem_ptr->ai_privs + aclitem_ptr->ai_grantee + aclitem_ptr->ai_grantor);
buf = &aclitem_buf;
len = sizeof(aclitem_buf);
break;
/*
* ANYARRAY is a pseudo-type. We use it to include
* any of the array types (OIDs 1007-1033 in pg_type.h).
* caller needs to be sure the type is ANYARRAYOID
* before calling cdbhash on an array (INSERT and COPY do so).
*/
case ANYARRAYOID:
arrbuf = DatumGetArrayTypeP(datum);
len = VARSIZE(arrbuf) - VARHDRSZ;
buf = VARDATA(arrbuf);
break;
case INT2VECTOROID:
i2vec_buf = (int2vector *) DatumGetPointer(datum);
len = i2vec_buf->dim1 * sizeof(int2);
buf = (void *)i2vec_buf->values;
break;
case OIDVECTOROID:
oidvec_buf = (oidvector *) DatumGetPointer(datum);
len = oidvec_buf->dim1 * sizeof(Oid);
buf = oidvec_buf->values;
break;
case CASHOID: /* cash is stored in int32 internally */
cash_buf = (* (Cash *)DatumGetPointer(datum));
len = sizeof(Cash);
buf = &cash_buf;
break;
default:
ereport(ERROR,
(errcode(ERRCODE_CDB_FEATURE_NOT_YET),
errmsg("Type %u is not hashable.", type)));
} /* switch(type) */
/* do the hash using the selected algorithm */
hashFn(clientData, buf, len);
if(tofree)
pfree(tofree);
}
/*
* Add a NULL attribute to the hash calculation.
*/
void
cdbhashnull(CdbHash *h)
{
hashNullDatum(addToCdbHash, (void*)h);
}
/**
* Update the hash value for a null Datum
*
* @param hashFn called to update the hash value.
* @param clientData passed to hashFn.
*/
void
hashNullDatum(datumHashFunction hashFn, void *clientData)
{
uint32 nullbuf = NULL_VAL; /* stores the constant value that
* represents a NULL */
void *buf = &nullbuf; /* stores the address of the buffer */
size_t len = sizeof(nullbuf); /* length of the value */
hashFn(clientData, buf, len);
}
/*
* Hash a tuple of a relation with an empty policy (no hash
* key exists) via round robin with a random initial value.
*/
void
cdbhashnokey(CdbHash *h)
{
uint32 rrbuf = h->rrindex;
void *buf = &rrbuf;
size_t len = sizeof(rrbuf);
/* do the hash using the selected algorithm */
h->hash = (h->hashfn) (buf, len, h->hash);
h->rrindex++; /* increment for next time around */
}
/*
* Reduce the hash to a segment number.
*/
unsigned int
cdbhashreduce(CdbHash *h)
{
int result = 0; /* TODO: what is a good initialization value?
* could we guarantee at this point that there
* will not be a negative segid in Greenplum Database and
* therefore initialize to this value for
* error checking? */
assert(h->reducealg == REDUCE_BITMASK || h->reducealg == REDUCE_LAZYMOD);
/*
* Reduce our 32-bit hash value to a segment number
*/
switch (h->reducealg)
{
case REDUCE_BITMASK:
result = FASTMOD(h->hash, (uint32) h->numsegs); /* fast mod (bitmask) */
break;
case REDUCE_LAZYMOD:
result = (h->hash) % (h->numsegs); /* simple mod */
break;
}
return result;
}
bool
typeIsArrayType(Oid typeoid)
{
Type tup = typeidType(typeoid);
Form_pg_type typeform;
bool res = false;
typeform = (Form_pg_type)GETSTRUCT(tup);
if (typeform->typelem != InvalidOid &&
typeform->typtype != 'd' &&
NameStr(typeform->typname)[0] == '_' &&
typeform->typinput == F_ARRAY_IN)
res = true;
ReleaseType(tup);
return res;
}
bool isGreenplumDbHashable(Oid typid)
{
/* we can hash all arrays */
if (typeIsArrayType(typid))
return true;
/*
* if this type is a domain type, get its base type.
*/
if (get_typtype(typid) == 'd')
typid = getBaseType(typid);
switch(typid)
{
case INT2OID:
case INT4OID:
case INT8OID:
case FLOAT4OID:
case FLOAT8OID:
case NUMERICOID:
case CHAROID:
case BPCHAROID:
case TEXTOID:
case VARCHAROID:
case BYTEAOID:
case NAMEOID:
case OIDOID:
case TIDOID:
case REGPROCOID:
case REGPROCEDUREOID:
case REGOPEROID:
case REGOPERATOROID:
case REGCLASSOID:
case REGTYPEOID:
case TIMESTAMPOID:
case TIMESTAMPTZOID:
case DATEOID:
case TIMEOID:
case TIMETZOID:
case INTERVALOID:
case ABSTIMEOID:
case RELTIMEOID:
case TINTERVALOID:
case INETOID:
case CIDROID:
case MACADDROID:
case BITOID:
case VARBITOID:
case BOOLOID:
case ACLITEMOID:
case ANYARRAYOID:
case INT2VECTOROID:
case OIDVECTOROID:
case CASHOID:
return true;
default:
return false;
}
}
/*
* fnv1_32_buf - perform a 32 bit FNV 1 hash on a buffer
*
* input:
* buf - start of buffer to hash
* len - length of buffer in octets (bytes)
* hval - previous hash value or FNV1_32_INIT if first call.
*
* returns:
* 32 bit hash as a static hash type
*/
uint32
fnv1_32_buf(void *buf, size_t len, uint32 hval)
{
unsigned char *bp = (unsigned char *) buf; /* start of buffer */
unsigned char *be = bp + len; /* beyond end of buffer */
/*
* FNV-1 hash each octet in the buffer
*/
while (bp < be)
{
/* multiply by the 32 bit FNV magic prime mod 2^32 */
#if defined(NO_FNV_GCC_OPTIMIZATION)
hval *= FNV_32_PRIME;
#else
hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
#endif
/* xor the bottom with the current octet */
hval ^= (uint32) *bp++;
}
/* return our new hash value */
return hval;
}
/*
* fnv1a_32_buf - perform a 32 bit FNV 1A hash on a buffer
*
* input:
* buf - start of buffer to hash
* len - length of buffer in octets (bytes)
* hval - previous hash value or FNV1_32_INIT if first call.
*
* returns:
* 32 bit hash as a static hash type
*/
uint32
fnv1a_32_buf(void *buf, size_t len, uint32 hval)
{
unsigned char *bp = (unsigned char *) buf; /* start of buffer */
unsigned char *be = bp + len; /* beyond end of buffer */
/*
* FNV-1 hash each octet in the buffer
*/
while (bp < be)
{
/* xor the bottom with the current octet */
hval ^= (uint32) *bp++;
/* multiply by the 32 bit FNV magic prime mod 2^32 */
#if defined(NO_FNV_GCC_OPTIMIZATION)
hval *= FNV_32_PRIME;
#else
hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
#endif
}
/* return our new hash value */
return hval;
}
/*
* Support function for hashing on inet/cidr (see network.c)
*
* Since network_cmp considers only ip_family, ip_bits, and ip_addr,
* only these fields may be used in the hash; in particular don't use type.
*/
int inet_getkey(inet *addr, unsigned char *inet_key, int key_size)
{
int addrsize;
switch (((inet_struct *)VARDATA_ANY(addr))->family)
{
case PGSQL_AF_INET:
addrsize = 4;
break;
case PGSQL_AF_INET6:
addrsize = 16;
break;
default:
addrsize = 0;
}
Assert(addrsize + 2 <= key_size);
inet_key[0] = ((inet_struct *)VARDATA_ANY(addr))->family;
inet_key[1] = ((inet_struct *)VARDATA_ANY(addr))->bits;
memcpy(inet_key + 2, ((inet_struct *)VARDATA_ANY(addr))->ipaddr, addrsize);
return (addrsize + 2);
}
/*================================================================
*
* GENERAL PURPOSE UTILS
*
*================================================================
*/
/*
* Given the original length of the data array this function is
* recalculating the length after ignoring any trailing blanks. The
* actual data is remained unmodified.
*/
int
ignoreblanks(char *data, int len)
{
/* look for trailing blanks and skip them in the hash calculation */
while (data[len - 1] == ' ')
{
len--;
if (len == 1) /* if only 1 char is left, leave it alone!
* (the string is either empty or has 1 char) */
break;
}
return len;
}
/*
* returns 1 is the input int is a power of 2 and 0 otherwize.
*/
int
ispowof2(int numsegs)
{
return !(numsegs & (numsegs - 1));
}