blob: 9e8fed0c1a2cc28904083b56a6267a27e689cd3f [file] [log] [blame]
// -*-C++-*-
// *********************************************************************
//
// File: hash_table.cpp
// Description:
//
//
// Created: 7/10/95
// Language: C++
//
//
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
// *********************************************************************
#include "hash_table.h"
void HashRow::print(ULng32 rowlength) {
printf("\tHashValue: %6d\n\tData: ", hashValue_);
unsigned char * data = (unsigned char *)this;
ULng32 i = 0;
for (; i < rowlength; i++)
printf("%02x", data[i]);
printf("\n\t : ");
for (i = 0; i < rowlength; i++)
printf("%2c", ((data[i] >= ' ' && data[i] < 127) ? data[i] : '?'));
printf("\n\t ");
for (i = 0; i < rowlength; i++)
if (i % 10)
printf (" ");
else
printf ("%1d0", i/10);
printf ("\n");
}
HashTableCursor::HashTableCursor() {
init();
}
HashTableCursor::~HashTableCursor() {
}
HashTableHeader::HashTableHeader() {
init();
}
void HashTableHeader::init() {
// rowCount_ = 0;
row_ = NULL;
}
HashTable::HashTable(ULng32 headerCount,
NABoolean evenFactor,
ULng32 primeFactor,
NABoolean noHVDups,
NABoolean doResize
)
: headerCount_(headerCount > MAX_HEADER_COUNT ?
MAX_HEADER_COUNT : headerCount < MIN_HEADER_COUNT ?
MIN_HEADER_COUNT : headerCount),
rowCount_(0),
header_(NULL),
noHashValueDups_(noHVDups),
resizeThreshold_(UINT_MAX), // default - don't resize
originalSize_(TRUE), // this HT was not resized yet
hashTableOverflow_(FALSE),
singleChainLastRow_(NULL) {
#ifdef _DEBUG
if ( getenv("ALLOW_HV_DUPS" ) ) noHashValueDups_ = FALSE; // for testing
#endif
headerCount_ *= 2; // the first iteration below would divide it back
// allocate the hash table headers. If we can't allocate it,
// try a smaller table. If we can't even allocate a table with
// MIN_HEADER_COUNT or somewhat more, we die!
while (!header_) {
headerCount_ /= 2;
if ( headerCount_ < MIN_HEADER_COUNT ) return; // allocation failed
// Ensure that headerCount_ has no common prime factors with #buckets
if ( evenFactor && 0 == headerCount_ % 2 )
headerCount_++ ; // the count is even no more
if ( 0 == headerCount_ % primeFactor )
headerCount_ += 2 ; // break the common factor, since primeFactor > 2
header_ = (HashTableHeader *)collHeap()->allocateMemory((UInt32)
(headerCount_ * sizeof(HashTableHeader)), FALSE);
}
if ( doResize )
resizeThreshold_ = (headerCount_ / 100) * HT_LOAD_FACTOR_PERCENT ;
init(); // set all entries in the hash table to NULL
}
HashTable::~HashTable() {
if (header_) {
collHeap()->deallocateMemory((void*)header_);
header_ = NULL;
}
}
void HashTable::init() {
// memset is about six times faster than this loop
// for (unsigned long i = 0; i < headerCount_; i++)
// header_[i].init();
memset( header_ , 0, headerCount_ * sizeof(HashTableHeader) );
}
//////////////////////////////////////////////////////////////////////
// Chains a row into the appropriate hash table chain. It is
// inserted after the first matching row or at the beginning
// of the hash chain, if a matching row is not found.
//////////////////////////////////////////////////////////////////////
void HashTable::insert(atp_struct * workAtp,
HashRow * newRow,
tupp &workAtpTupp1,
tupp &workAtpTupp2,
ex_expr * searchExpr) {
ex_assert( searchExpr,
"This insert() should only be used for searchable entries. ");
// Calculate the simple hash value for the new row
SimpleHashValue newRowHV = newRow->hashValue();
// Determine the hash chain and get the first entry in the chain.
ULng32 chainIdx = newRowHV % headerCount_;
ULng32 origChainIdx = chainIdx;
HashRow *row = header_[chainIdx].row_;
// Boolean used to control when the data pointer for tupp1 should be set
NABoolean isTupp1Set = FALSE;
while (row != NULL) {
if (newRowHV == row->hashValue()) {
// The expression does not expect the HashRow to be part of the row.
// Adjust the datapointer in the work atp to point beyond the HashRow.
workAtpTupp2.setDataPointer(row->getData());
// Set the data in the workAtp for the first index if not set already
if (!isTupp1Set) {
// The expression does not expect the HashRow to be part of the row.
// Adjust the datapointer in the work atp to point beyond the HashRow.
workAtpTupp1.setDataPointer(newRow->getData());
isTupp1Set = TRUE;
}
if ( searchExpr->eval(workAtp, workAtp) == ex_expr::EXPR_TRUE )
break; // Break from while loop.
else if ( noHashValueDups_ ) {
// Different key-value, but with the same HV, try the next HT entry
chainIdx = ++chainIdx % headerCount_; // next HT entry
if ( (chainIdx + 1) % headerCount_ == origChainIdx ) {
// Oops - an overflow -- more diff key-value sub-chains (with same HV)
// than the size of the hash table
// (extremely rare - practically impossible -- requires a very
// small HT (i.e. memory pressure) and a very bad hash function.)
// After an overflow - revert to the other method -- allow dups
// (The "+ 1" above ensures at least one HT entry without this HV
// to prevent an infinite loop in position() )
noHashValueDups_ = FALSE; // stop disallowing dups
chainIdx = origChainIdx; // return to the original chain
hashTableOverflow_ = TRUE; // remember that the HT still contains
// "re-placed" sub-chains from before the overflow; they should
// be searched for in the position() method
}
row = header_[chainIdx].row_; // first row in next chain
continue; // loop again
}
}
row = row->next();
}
// If the row was not found, insert at the beginning of the chain.
// Otherwise, insert after the first match.
if (row == NULL) {
newRow->setNext(header_[chainIdx].row_);
header_[chainIdx].row_ = newRow;
}
else {
newRow->setNext(row->next());
row->setNext(newRow);
}
// one more row in the chain
// header_[chainIdx].rowCount_++;
// one more row in the hash table
rowCount_++;
}
///////////////////////////////////////////////////////////////////
// chain a row unconditionally into the hash table. Don't look
// for matches.
///////////////////////////////////////////////////////////////////
NABoolean HashTable::insert(HashRow * newRow) {
// determine the hash chain
ULng32 chainIdx = newRow->hashValue() % headerCount_;
// Insert at the beginning of the chain
newRow->setNext(header_[chainIdx].row_);
header_[chainIdx].row_ = newRow;
// one more row in the hash table
return ++rowCount_ > resizeThreshold_ ;
}
////////////////////////////////////////////////////////////////////////
// Resize the hash-table (HGB only; no duplicate entries; don't care order)
// (if we can't allocate a new hash-table then we stop resizing; reason is
// that we'd have to overflow anyway soon, and likely flush this cluster.)
///////////////////////////////////////////////////////////////////////
ULng32 HashTable::resize(NABoolean enoughMemory) {
if ( ! enoughMemory ) {
resizeThreshold_ = UINT_MAX ; // stop resizing
return 0;
}
ULng32 newSize = resizeTo() ; // the size for the new HT
HashTableHeader * newHeader =
(HashTableHeader *)
collHeap()->allocateMemory((UInt32)
(newSize * sizeof(HashTableHeader)), FALSE);
if ( ! newHeader ) {
resizeThreshold_ = UINT_MAX ; // stop resizing
return 0;
}
// initialize new HT
memset( newHeader , 0, newSize * sizeof(HashTableHeader) );
// insert entries from the old HT into the new HT
for ( ULng32 oldind = 0; oldind < headerCount_; oldind++ ) {
HashRow *thisRow = header_[oldind].row_ ;
// traverse the old chain; reassign each entry
while ( thisRow ) {
HashRow * nextRow = thisRow->next_ ;
// determine the new hash chain
ULng32 chainIdx = thisRow->hashValue() % newSize;
// Insert at the beginning of the chain in the new HT
thisRow->setNext(newHeader[chainIdx].row_);
newHeader[chainIdx].row_ = thisRow;
thisRow = nextRow ;
}
}
ULng32 memAdded =
( newSize - headerCount_ ) * sizeof(HashTableHeader) ;
// remove old one and replace old fields
collHeap()->deallocateMemory((void*)header_);
header_ = newHeader ;
headerCount_ = newSize ;
resizeThreshold_ = HT_LOAD_FACTOR_PERCENT * (headerCount_ / 100) ;
originalSize_ = FALSE; // this HT was resized
return memAdded ;
}
// HashTable::insertUniq()
// An insert method used by UniqueHashJoin.
// Assumes input is unique and so does not need to check for
// duplicates.
// Entries are inserted in HashValue order.
//
void HashTable::insertUniq(HashRow *newRow)
{
SimpleHashValue newRowHashValue = newRow->hashValueRaw();
// Get the first entry for this chain
//
ULng32 chainIdx = newRowHashValue % headerCount_;
HashRow *row = header_[chainIdx].row_;
// Entries are inserted in HashValue order
// If this entry belongs at the beginning of the chain
// insert it there.
//
if(!row || newRowHashValue <= row->hashValueRaw()) {
newRow->setNext(row);
header_[chainIdx].row_ = newRow;
return;
}
// Cursor so we can insert the new row in the proper position.
//
HashRow *prevRow = row;
row = row->next();
// Find the position in the chain where this hash value belongs
//
while (row && newRowHashValue > row->hashValueRaw()) {
prevRow = row;
row = row->next();
}
// Insert this entry between this row and the previous row
//
newRow->setNext(row);
prevRow->setNext(newRow);
return;
}
///////////////////////////////////////////////////////////////////
// Add a row to the end of the first hash header. The purpose
// of this function is to speed up cross product joins that don't
// have a search predicate and to keep the rows in order by
// placing new rows at the tail. This function is only used by
// cross-product.
///////////////////////////////////////////////////////////////////
void HashTable::insertSingleChain(HashRow * newRow) {
if (header_[0].row_ == NULL)
header_[0].row_ = newRow;
else
singleChainLastRow_->next_ = newRow;
singleChainLastRow_ = newRow;
newRow->setNext(NULL);
// one more row in the hash table
rowCount_++;
}
///////////////////////////////////////////////////////////////
// convert the rowCounts in the HashTableHeader into offsets
// to allow addressing of the bitmap for right outer joins.
// This feature is disabled for now
///////////////////////////////////////////////////////////////
void HashTable::convertToOffsets() {
/*
unsigned long offset = 0;
unsigned long rowCount = 0;
for (unsigned long i = 0; i < headerCount_; i++) {
rowCount += header_[headerCount_].rowCount_;
header_[headerCount_].rowCount_ = offset;
offset = rowCount;
}
*/
}
///////////////////////////////////////////////////////////////
// RETURNS: the next row for the cursor, if it exists.
// NULL(0), if all rows have been returned.
///////////////////////////////////////////////////////////////
HashRow * HashTable::getNext(HashTableCursor * cursor) {
HashRow * row = cursor->currRow_;
if (!row)
// we reached the end of this cursor
return (HashRow *) NULL;
if (cursor->currRow_ != cursor->endRow_) {
cursor->currRow_ = row->next();
if (cursor->currRow_ == NULL) {
// end of chain, if there is another chain, switch to it
cursor->currHeader_++;
while ((cursor->currHeader_ < headerCount_) &&
//skip all empty chains
(!(header_[cursor->currHeader_].row_)))
cursor->currHeader_++;
if (cursor->currHeader_ < headerCount_)
cursor->currRow_ = header_[cursor->currHeader_].row_;
}
}
else
// end of cursor, set currRow_ NULL, such that with the next call
// we return NULL
cursor->currRow_ = NULL;
return row;
}
/////////////////////////////////////////////////////////////////////
// finds the first matching entry and remembers it.
// finds the last matching entry and remembers it.
// They will be used later to return them to the caller by
// calling getNext().
/////////////////////////////////////////////////////////////////////
void HashTable::position(HashTableCursor * cursor,
atp_struct * rowAtp,
atp_struct * workAtp,
short hashTableRowAtpIndex,
ex_expr * searchExpr,
SimpleHashValue hashValue,
NABoolean noDupChaining,
NABoolean returnOrdered) {
ex_assert( searchExpr,
"This position() should only be used for searchable entries. ");
// Get the tuple used for the search expression and get the chain index.
tupp &workAtpTupp = workAtp->getTupp(hashTableRowAtpIndex);
cursor->currHeader_ = hashValue % headerCount_;
// When no HV dups, and when we need to skip to the next Hash Table entry,
// the dummyRow is logically positioned one behind the first row in that
HashRow dummyRow; // next HT entry to fix the for-loop's "row = row->next_"
// This loop searches for a first matching row. If one is found,
// the last matching row is found within the "if" condition in
// this loop.
for (HashRow *row = header_[cursor->currHeader_].row_;
row != NULL;
row = row->next()) {
// If the simple hash doesn't match, then skip to the next row.
if (hashValue != row->hashValue())
continue;
// Set the data pointer in the tuple to point to this row.
// The expression does not expect the HashRow to be part of the row.
// Adjust the datapointer in the work atp to point beyond the HashRow.
workAtpTupp.setDataPointer(row->getData());
// If this row matches, then set the cursor to the current row,
// search for the last matching row and return.
if ( searchExpr->eval(rowAtp, workAtp) == ex_expr::EXPR_TRUE ) {
ULng32 matchCount = 1; // num matched so far
// Set the row pointers in the cursor to the first matching row
cursor->beginRow_ = cursor->endRow_ = cursor->currRow_ = row;
// Sometimes (e.g. for some (anti)semi-joins) we don't need more than
// one match and if there are many dups, finding the end of the
// sub-chain has a severe performance penalty -- see solution
// 10-080320-1596.
if (noDupChaining)
return;
// Start searching on the next row for the last matching row
for (row = row->next(); row != NULL; row = row->next()) {
if (row->hashValue() != hashValue)
break; // different HV, search no more (key-value must be different)
if ( ! noHashValueDups_ ) {
// the next HV is the same - check if the key-value is different
// The expression does not expect the HashRow to be part of the row.
// Adjust the datapointer in the work atp to point beyond the HashRow.
workAtpTupp.setDataPointer(row->getData());
if ( searchExpr->eval(rowAtp, workAtp) != ex_expr::EXPR_TRUE )
break; // different key-value - end of this sub-chain
}
// Found one more row for this sub-chain
// Set the end row to this matching row.
cursor->endRow_ = row;
matchCount++;
}
// At this point the cursor is pointing to the sub-chain of matched rows
// Ordering request would be made at most once per sub-chain (since this is
// only for Ordered HJ with unique left rows); the subchain is ordered now
// (first) --> (last) --> (before-last) --> ..... (second)
// so we reorder the subchain before returning, in a way similar to insert(),
// by reinserting all the rows following the first one.
if ( returnOrdered && matchCount > 2 ) {
HashRow *first = cursor->beginRow_;
// HashRow *second = cursor->endRow_ ;
HashRow *beyondLast = cursor->endRow_->next() ;
HashRow *last = cursor->beginRow_->next();
HashRow *tmpPtr = last->next(); // start from before-last
last->setNext( beyondLast );
cursor->endRow_ = last ; // current 2nd is the last
while ( tmpPtr != beyondLast ) {
HashRow *nextOne = tmpPtr->next();
tmpPtr->setNext(first->next());
first->setNext(tmpPtr);
tmpPtr = nextOne;
}
}
return;
}
else if ( noHashValueDups_ || hashTableOverflow_ ) {
// if HT overflow -- keep trying on this chain
if ( hashTableOverflow_ ) {
if ( row->next() && row->next()->hashValue() == hashValue )
continue; // stay on this chain; maybe it is the next row
}
// Same HV in this HT entry, but of different key-value, (or after a HT
// overflow, but at the end of the chain) then try next HT entry
cursor->currHeader_ = ++cursor->currHeader_ % headerCount_;
// set the dummyRow logically behind the first row on the next HT entry
dummyRow.next_ = header_[cursor->currHeader_].row_ ;
row = &dummyRow ; // so after "row = row->next_" it'll be the 1-st row
continue; // back to loop to handle the first row of the next HT entry
}
}
// No matching rows were found. Reset the cursor before returning.
cursor->init();
}
// HashTable::positionUniq()
// A position method used by UniqueHashJoin.
// Assumes entries are unique and so does not need to check for
// duplicates.
// Returns first match found.
//
ex_expr::exp_return_type HashTable::positionUniq(HashRow **currRow,
atp_struct * leftRowAtp,
atp_struct * workAtp,
short rightRowAtpIndex,
ex_expr * searchExpr,
SimpleHashValue hashValue)
{
// Get the first entry for this chain
//
ULng32 chainIdx = hashValue % headerCount_;
HashRow *row = header_[chainIdx].row_;
// Skip over entries that have a lower hash value.
//
while (row && hashValue > row->hashValueRaw()) {
row = row->next();
}
// Look for a match among those entries with the same hash value.
//
while (row && hashValue == row->hashValueRaw()) {
// Set the data pointer in the tuple to point to this row.
// The expression does not expect the HashRow to be part of the row.
// Adjust the datapointer in the work atp to point beyond the HashRow.
workAtp->getTupp(rightRowAtpIndex).setDataPointer(row->getData());
// If this row matches, then return this row
// No need to search further since we know the entries are unique.
//
ex_expr::exp_return_type retCode = searchExpr->eval(leftRowAtp, workAtp);
if(retCode != ex_expr::EXPR_FALSE) {
// Set the row pointer to the matching row.
*currRow = row;
// Found (or error condition).
return retCode;
}
// Try the next row.
row = row->next();
}
// If got to the end of the chain or an entry with a greater hash value
// then no match was found.
// Not found
return ex_expr::EXPR_FALSE;
}
/////////////////////////////////////////////////////////
// positions to the first entry in the hash table.
// Remember the last entry in the table, if table has
// entries.
// Used to retrieve all rows from the hash table by
// calling getNext().
/////////////////////////////////////////////////////////
void HashTable::position(HashTableCursor * cursor) {
// find the first non empty chain
ULng32 i = 0;
while ((i < headerCount_) && (!(header_[i].row_)))
i++;
if (i == headerCount_)
// empty table
cursor->init();
else {
cursor->beginRow_ = header_[i].row_;
cursor->currHeader_ = i;
// now find the last entry in the hash table
i = headerCount_ - 1;
// can never reach i < 0. Atleast one non-null chain must exist
while (!(header_[i].row_))
i--;
cursor->endRow_ = header_[i].row_;
while (cursor->endRow_->next_)
cursor->endRow_ = cursor->endRow_->next_;
}
cursor->currRow_ = cursor->beginRow_;
}
/////////////////////////////////////////////////////////
// positionSingleChain() is used by cross product to
// return a single list of all rows in the inner table
/////////////////////////////////////////////////////////
void HashTable::positionSingleChain(HashTableCursor * cursor) {
cursor->currHeader_ = 0;
cursor->currRow_ = cursor->beginRow_ = header_[0].row_;
cursor->endRow_ = singleChainLastRow_;
}
/////////////////////////////////////////////////////////
// determine the length of hash chain chainIdx
/////////////////////////////////////////////////////////
ULng32 HashTable::getChainSize(ULng32 chainIdx) {
// sanity check
ex_assert(chainIdx < headerCount_, "out of bound hash table access");
ULng32 count = 0;
HashRow * row = header_[chainIdx].row_;
while(row) {
count++;
row = row->next();
}
return count;
}