// blob: ad8dfdfe3fddb642a0f9a9b1132c091a0a6f23dd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <cstdint>
#include <map>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include "dbcommon/common/tuple-batch.h"
#include "dbcommon/common/tuple-desc.h"
#include "dbcommon/hash/hash-keys.h"
#include "dbcommon/hash/native-hash-table.h"
#include "dbcommon/log/debug-logger.h"
#include "dbcommon/testutil/tuple-batch-utils.h"
#include "gtest/gtest.h"
namespace dbcommon {
/*
* In this test, we are going to check all the functions in native-hash-table.h
* group-keys.h, and the related checkGroupByKey/checkGroupByKeys in Vector.
* We need to check the combination of (with nulls/no nulls) * (direct
* encoding/dictionary encoding) * (with select list/ without select list).
*/
class TestHashTable : public ::testing::Test {
public:
TestHashTable() {
desc = TupleBatchUtility::generateTupleDesc(
"schema: "
"boolean "
"int8 int16 int32 int64 "
"float double decimal decimal_new "
"varchar varchar(5) string "
"binary "
"timestamp time date");
}
void test(std::vector<uint64_t> grpIdxs) {
std::string hashkey;
for (auto idx : grpIdxs) {
hashkey = hashkey + ' ' +
TypeUtil::instance()
->getTypeEntryById(desc->getColumnType(idx - 1))
->name;
}
LOG_TESTING("hash key: %s", hashkey.c_str());
hashTable.reset(new HashTable(*desc, grpIdxs, 1));
testInsert(grpIdxs, true, true);
testInsert(grpIdxs, true, false);
testInsert(grpIdxs, false, true);
testInsert(grpIdxs, false, false);
testCheck(grpIdxs, true, true);
testCheck(grpIdxs, true, false);
testCheck(grpIdxs, true, true);
testCheck(grpIdxs, false, false);
}
private:
typedef NativeAggHashTable HashTable;
void resetTupleBatch(bool hasNull, bool withSelectList) {
batch = TupleBatchUtility::generateTupleBatchRandom(*desc, 1, 20, hasNull,
withSelectList);
}
void resetAnsForInsert(std::vector<uint64_t> grpIdxs) {
groupKeys.clear();
ans.clear();
for (int rowIdx = 0; rowIdx < batch->getNumOfRows(); rowIdx++) {
std::string groupKey = "";
for (auto grpIdx : grpIdxs) {
bool null;
std::string tmp = batch->vectors[grpIdx - 1]->read(rowIdx, &null);
if (!null) groupKey += tmp;
groupKey += null ? '$' : '@';
// LOG_INFO("%s", groupKey.c_str());
}
if (groupKeys.find(groupKey) != groupKeys.end()) {
ans.push_back(groupKeys[groupKey]);
} else {
ans.push_back(groupKeys.size());
groupKeys[groupKey] = ans.back();
}
}
}
void testInsert(std::vector<uint64_t> grpIdxs, bool hasNull,
bool withSelectList) {
LOG_TESTING("Insert TB: %s nulls\t%s select list", (hasNull ? "has" : "no"),
(withSelectList ? "with" : "without"));
resetTupleBatch(hasNull, withSelectList);
resetAnsForInsert(grpIdxs);
hashTable->reset();
for (int iter = 0; iter < 2; iter++) {
std::vector<uint64_t> ret = hashTable->insert(batch.get());
EXPECT_EQ(ans, ret);
}
}
void resetAnsForCheck(std::vector<uint64_t> grpIdxs) {
ans.clear();
for (int rowIdx = 0; rowIdx < batch->getNumOfRows(); rowIdx++) {
std::string groupKey = "";
for (auto grpIdx : grpIdxs) {
bool null;
std::string tmp = batch->vectors[grpIdx - 1]->read(rowIdx, &null);
if (!null) groupKey += tmp;
groupKey += null ? '$' : '@';
// LOG_INFO("%s", groupKey.c_str());
}
if (groupKeys.find(groupKey) != groupKeys.end()) {
ans.push_back(groupKeys[groupKey]);
} else {
ans.push_back(NOT_IN_HASHTABLE);
}
}
}
void testCheck(std::vector<uint64_t> grpIdxs, bool hasNull,
bool withSelectList) {
LOG_TESTING("Check TB: %s nulls\t%s select list", (hasNull ? "has" : "no"),
(withSelectList ? "with" : "without"));
resetTupleBatch(hasNull, withSelectList);
resetAnsForCheck(grpIdxs);
std::vector<uint64_t> ret = hashTable->check(batch.get());
EXPECT_EQ(ans, ret);
}
dbcommon::TupleDesc::uptr desc;
dbcommon::TupleBatch::uptr batch;
std::map<std::string, uint64_t> groupKeys;
std::vector<uint64_t> ans;
std::unique_ptr<HashTable> hashTable;
};
// Runs the fixture scenario for every strictly increasing triple of 1-based
// column indices, i.e. for every three-column grouping key over the schema.
TEST_F(TestHashTable, TestHashOnMutipleColumn) {
  const uint64_t numCols = desc->getNumOfColumns();
  for (uint64_t first = 1; first <= numCols; ++first) {
    for (uint64_t second = first + 1; second <= numCols; ++second) {
      for (uint64_t third = second + 1; third <= numCols; ++third) {
        test({first, second, third});
      }
    }
  }
}
// Runs the fixture scenario once per column, using that single column
// (1-based index) as the grouping key.
TEST_F(TestHashTable, TestHashOnSingleColumn) {
  const uint64_t numCols = desc->getNumOfColumns();
  uint64_t column = 1;
  while (column <= numCols) {
    test({column});
    ++column;
  }
}
// Also test internal NormalAccess and QuickAccess at the same time.
//
// Inserts two full batches of pairwise distinct three-column keys into one
// table and expects dense group ids in insertion order. After each batch the
// reported memory usage must not exceed
// DEFAULT_SIZE_PER_HASHKEY_BLK + 2 * DEFAULT_SIZE_PER_HASH_CHAIN_BLK.
TEST_F(TestHashTable, TestResize) {
  dbcommon::TupleBatchUtility tbu;
  // The call already yields a temporary, so wrapping it in std::move would be
  // redundant and would only inhibit copy elision. The locals are named so
  // that they do not shadow the fixture's desc/hashTable/batch/ans members.
  // NOTE(review): "ifs" presumably selects an int, a float and a string
  // column -- confirm against TupleBatchUtility::generateTupleDesc.
  dbcommon::TupleDesc::uptr resizeDesc = tbu.generateTupleDesc("ifs");
  std::unique_ptr<HashTable> table(new HashTable(*resizeDesc, {1, 2, 3}, 1));
  {
    // First batch: keys (i, 3.14, "hah"), all distinct through column 1, so
    // row i is expected to receive group id i.
    dbcommon::TupleBatch::uptr firstBatch(
        new dbcommon::TupleBatch(*resizeDesc, true));
    dbcommon::TupleBatchWriter &writer = firstBatch->getTupleBatchWriter();
    std::vector<uint64_t> expected;
    for (int i = 0; i < DEFAULT_NUMBER_TUPLES_PER_BATCH; i++) {
      writer[0]->append(std::to_string(i), false);
      writer[1]->append("3.14", false);
      writer[2]->append("hah", false);
      expected.push_back(i);
    }
    firstBatch->incNumOfRows(DEFAULT_NUMBER_TUPLES_PER_BATCH);
    std::vector<uint64_t> ret = table->insert(firstBatch.get());
    EXPECT_EQ(expected, ret);
    EXPECT_GE(
        DEFAULT_SIZE_PER_HASHKEY_BLK + DEFAULT_SIZE_PER_HASH_CHAIN_BLK * 2,
        table->getMemUsed());
  }
  {  // after inserted one tuple batch, resize is coming
    // Second batch: keys (0, 3.14, i). Column 3 is numeric text here and
    // "hah" in the first batch, so every key is new and the ids continue at
    // DEFAULT_NUMBER_TUPLES_PER_BATCH.
    dbcommon::TupleBatch::uptr secondBatch(
        new dbcommon::TupleBatch(*resizeDesc, true));
    dbcommon::TupleBatchWriter &writer = secondBatch->getTupleBatchWriter();
    std::vector<uint64_t> expected;
    for (int i = 0; i < DEFAULT_NUMBER_TUPLES_PER_BATCH; i++) {
      writer[0]->append("0", false);
      writer[1]->append("3.14", false);
      writer[2]->append(std::to_string(i), false);
      expected.push_back(i + DEFAULT_NUMBER_TUPLES_PER_BATCH);
    }
    secondBatch->incNumOfRows(DEFAULT_NUMBER_TUPLES_PER_BATCH);
    std::vector<uint64_t> ret = table->insert(secondBatch.get());
    EXPECT_EQ(expected, ret);
    // Re-inserting the same rows must find the existing groups and return the
    // very same ids.
    ret = table->insert(secondBatch.get());
    EXPECT_EQ(expected, ret);
    EXPECT_GE(
        DEFAULT_SIZE_PER_HASHKEY_BLK + DEFAULT_SIZE_PER_HASH_CHAIN_BLK * 2,
        table->getMemUsed());
  }
}
// Feeds the same random batch to an aggregation hash table and to a join hash
// table, both keyed on every column of the schema, and expects the string
// form of the hash keys they report to be identical.
TEST_F(TestHashTable, TestGetHashkeys) {
  TupleBatchUtility utility;
  auto schema = utility.generateTupleDesc(
      "schema: "
      "boolean "
      "int8 int16 int32 int64 "
      "float double decimal_new "
      "bpchar bpchar(10) varchar varchar(5) string "
      "binary "
      "timestamp time date");

  // Key on all columns: 1-based indices 1..N.
  std::vector<uint64_t> keyColumns(schema->getNumOfColumns());
  std::iota(keyColumns.begin(), keyColumns.end(), 1);

  NativeAggHashTable aggTable(*schema, keyColumns, 0);
  NativeJoinHashTable joinTable(*schema, keyColumns, 0);
  joinTable.setupJoin(true, *schema, keyColumns, *schema, keyColumns);

  auto randomBatch = utility.generateTupleBatchRandom(*schema, 0, 10, true);
  aggTable.insert(randomBatch.get());
  joinTable.insert(randomBatch.get());

  auto aggKeys = aggTable.getHashKeys();
  auto joinKeys = joinTable.getHashKeys();
  EXPECT_EQ(aggKeys->toString(), joinKeys->toString());
}
} // namespace dbcommon