depends/dbcommon/test/unit/test-hash-table.cc - hawq - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 #include <cstdint>
 #include <map>
 #include <memory>
 #include <numeric>
 #include <string>
 #include <vector>

 #include "dbcommon/common/tuple-batch.h"
 #include "dbcommon/common/tuple-desc.h"
 #include "dbcommon/hash/hash-keys.h"
 #include "dbcommon/hash/native-hash-table.h"
 #include "dbcommon/log/debug-logger.h"
 #include "dbcommon/testutil/tuple-batch-utils.h"
 #include "gtest/gtest.h"

 namespace dbcommon {

 /*
  * In this test, we are going to check all the functions in
  * native-hash-table.h and group-keys.h, and the related
  * checkGroupByKey/checkGroupByKeys in Vector.
  * We need to check the combination of (with nulls/no nulls) * (direct
  * encoding/dictionary encoding) * (with select list/without select list).
  */

 class TestHashTable : public ::testing::Test {
  public:
   TestHashTable() {
     desc = TupleBatchUtility::generateTupleDesc(
         "schema: "
         "boolean "
         "int8 int16 int32 int64 "
         "float double decimal decimal_new "
         "varchar varchar(5) string "
         "binary "
         "timestamp time date");
   }

   void test(std::vector<uint64_t> grpIdxs) {
     std::string hashkey;
     for (auto idx : grpIdxs) {
       hashkey = hashkey + ' ' +
                 TypeUtil::instance()
                     ->getTypeEntryById(desc->getColumnType(idx - 1))
                     ->name;
     }
     LOG_TESTING("hash key: %s", hashkey.c_str());
     hashTable.reset(new HashTable(*desc, grpIdxs, 1));
     testInsert(grpIdxs, true, true);
     testInsert(grpIdxs, true, false);
     testInsert(grpIdxs, false, true);
     testInsert(grpIdxs, false, false);

     testCheck(grpIdxs, true, true);
     testCheck(grpIdxs, true, false);
     testCheck(grpIdxs, true, true);
     testCheck(grpIdxs, false, false);
   }

  private:
   typedef NativeAggHashTable HashTable;

   void resetTupleBatch(bool hasNull, bool withSelectList) {
     batch = TupleBatchUtility::generateTupleBatchRandom(*desc, 1, 20, hasNull,
                                                         withSelectList);
   }

   void resetAnsForInsert(std::vector<uint64_t> grpIdxs) {
     groupKeys.clear();
     ans.clear();
     for (int rowIdx = 0; rowIdx < batch->getNumOfRows(); rowIdx++) {
       std::string groupKey = "";
       for (auto grpIdx : grpIdxs) {
         bool null;
         std::string tmp = batch->vectors[grpIdx - 1]->read(rowIdx, &null);
         if (!null) groupKey += tmp;
         groupKey += null ? '$' : '@';
         // LOG_INFO("%s", groupKey.c_str());
       }
       if (groupKeys.find(groupKey) != groupKeys.end()) {
         ans.push_back(groupKeys[groupKey]);
       } else {
         ans.push_back(groupKeys.size());
         groupKeys[groupKey] = ans.back();
       }
     }
   }

   void testInsert(std::vector<uint64_t> grpIdxs, bool hasNull,
                   bool withSelectList) {
     LOG_TESTING("Insert TB: %s nulls\t%s select list", (hasNull ? "has" : "no"),
                 (withSelectList ? "with" : "without"));
     resetTupleBatch(hasNull, withSelectList);
     resetAnsForInsert(grpIdxs);
     hashTable->reset();
     for (int iter = 0; iter < 2; iter++) {
       std::vector<uint64_t> ret = hashTable->insert(batch.get());
       EXPECT_EQ(ans, ret);
     }
   }

   void resetAnsForCheck(std::vector<uint64_t> grpIdxs) {
     ans.clear();
     for (int rowIdx = 0; rowIdx < batch->getNumOfRows(); rowIdx++) {
       std::string groupKey = "";
       for (auto grpIdx : grpIdxs) {
         bool null;
         std::string tmp = batch->vectors[grpIdx - 1]->read(rowIdx, &null);
         if (!null) groupKey += tmp;
         groupKey += null ? '$' : '@';
         // LOG_INFO("%s", groupKey.c_str());
       }
       if (groupKeys.find(groupKey) != groupKeys.end()) {
         ans.push_back(groupKeys[groupKey]);
       } else {
         ans.push_back(NOT_IN_HASHTABLE);
       }
     }
   }

   void testCheck(std::vector<uint64_t> grpIdxs, bool hasNull,
                  bool withSelectList) {
     LOG_TESTING("Check TB: %s nulls\t%s select list", (hasNull ? "has" : "no"),
                 (withSelectList ? "with" : "without"));
     resetTupleBatch(hasNull, withSelectList);
     resetAnsForCheck(grpIdxs);
     std::vector<uint64_t> ret = hashTable->check(batch.get());
     EXPECT_EQ(ans, ret);
   }

   dbcommon::TupleDesc::uptr desc;
   dbcommon::TupleBatch::uptr batch;
   std::map<std::string, uint64_t> groupKeys;
   std::vector<uint64_t> ans;
   std::unique_ptr<HashTable> hashTable;
 };

 TEST_F(TestHashTable, TestHashOnMutipleColumn) {
   // Use every strictly increasing triple of 1-based column indexes as the
   // hash key.
   const auto numCols = desc->getNumOfColumns();
   for (uint64_t first = 1; first <= numCols; ++first) {
     for (uint64_t second = first + 1; second <= numCols; ++second) {
       for (uint64_t third = second + 1; third <= numCols; ++third) {
         test({first, second, third});
       }
     }
   }
 }

 TEST_F(TestHashTable, TestHashOnSingleColumn) {
   // Use each column on its own as the hash key (indexes are 1-based).
   const auto numCols = desc->getNumOfColumns();
   for (uint64_t col = 1; col <= numCols; ++col) {
     test({col});
   }
 }

 // Also test internal NormalAccess and QuickAccess at the same time.
 TEST_F(TestHashTable, TestResize) {
   // Locals carry their own names so they do not shadow the fixture's `desc`,
   // `hashTable`, `batch` and `ans` members.
   dbcommon::TupleBatchUtility tbu;
   // The factory result is already a temporary; wrapping it in std::move adds
   // nothing and only inhibits copy elision.
   // NOTE(review): "ifs" presumably stands for an int, a float and a string
   // column; confirm against TupleBatchUtility::generateTupleDesc.
   dbcommon::TupleDesc::uptr resizeDesc = tbu.generateTupleDesc("ifs");
   std::unique_ptr<HashTable> table(new HashTable(*resizeDesc, {1, 2, 3}, 1));

   {
     // First batch: column 1 differs on every row, so row i is expected to
     // open group i.
     dbcommon::TupleBatch::uptr firstBatch(
         new dbcommon::TupleBatch(*resizeDesc, true));
     dbcommon::TupleBatchWriter &writer = firstBatch->getTupleBatchWriter();
     std::vector<uint64_t> expected;
     for (int i = 0; i < DEFAULT_NUMBER_TUPLES_PER_BATCH; i++) {
       writer[0]->append(std::to_string(i), false);
       writer[1]->append("3.14", false);
       writer[2]->append("hah", false);
       expected.push_back(i);
     }
     firstBatch->incNumOfRows(DEFAULT_NUMBER_TUPLES_PER_BATCH);
     std::vector<uint64_t> ret = table->insert(firstBatch.get());
     EXPECT_EQ(expected, ret);
     // Memory used must stay within the fixed block budget.
     EXPECT_GE(
         DEFAULT_SIZE_PER_HASHKEY_BLK + DEFAULT_SIZE_PER_HASH_CHAIN_BLK * 2,
         table->getMemUsed());
   }

   {
     // Second batch: column 3 differs on every row and never equals "hah", so
     // every row opens a new group numbered after those of the first batch.
     // Per the original author, inserting past one full batch triggers a
     // resize of the table.
     dbcommon::TupleBatch::uptr secondBatch(
         new dbcommon::TupleBatch(*resizeDesc, true));
     dbcommon::TupleBatchWriter &writer = secondBatch->getTupleBatchWriter();
     std::vector<uint64_t> expected;
     for (int i = 0; i < DEFAULT_NUMBER_TUPLES_PER_BATCH; i++) {
       writer[0]->append("0", false);
       writer[1]->append("3.14", false);
       writer[2]->append(std::to_string(i), false);
       expected.push_back(i + DEFAULT_NUMBER_TUPLES_PER_BATCH);
     }
     secondBatch->incNumOfRows(DEFAULT_NUMBER_TUPLES_PER_BATCH);
     std::vector<uint64_t> ret = table->insert(secondBatch.get());
     EXPECT_EQ(expected, ret);
     // Re-inserting the same rows must return the ids assigned above.
     ret = table->insert(secondBatch.get());
     EXPECT_EQ(expected, ret);
     EXPECT_GE(
         DEFAULT_SIZE_PER_HASHKEY_BLK + DEFAULT_SIZE_PER_HASH_CHAIN_BLK * 2,
         table->getMemUsed());
   }
 }

 TEST_F(TestHashTable, TestGetHashkeys) {
   // An aggregation table and a join table fed with the same batch must
   // report identical hash keys.
   TupleBatchUtility tbu;
   auto wideDesc = tbu.generateTupleDesc(
       "schema: "
       "boolean "
       "int8 int16 int32 int64 "
       "float double decimal_new "
       "bpchar bpchar(10) varchar varchar(5) string "
       "binary "
       "timestamp time date");

   // Hash on every column: 1, 2, ..., N.
   std::vector<uint64_t> allCols(wideDesc->getNumOfColumns());
   std::iota(allCols.begin(), allCols.end(), 1);

   NativeAggHashTable aggTable(*wideDesc, allCols, 0);
   NativeJoinHashTable joinTable(*wideDesc, allCols, 0);
   joinTable.setupJoin(true, *wideDesc, allCols, *wideDesc, allCols);

   auto randomBatch = tbu.generateTupleBatchRandom(*wideDesc, 0, 10, true);
   aggTable.insert(randomBatch.get());
   joinTable.insert(randomBatch.get());

   auto aggKeys = aggTable.getHashKeys();
   auto joinKeys = joinTable.getHashKeys();
   EXPECT_EQ(aggKeys->toString(), joinKeys->toString());
 }

 }  // namespace dbcommon
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	#include <map>

	#include "dbcommon/common/tuple-batch.h"
	#include "dbcommon/common/tuple-desc.h"
	#include "dbcommon/hash/hash-keys.h"
	#include "dbcommon/hash/native-hash-table.h"
	#include "dbcommon/log/debug-logger.h"
	#include "dbcommon/testutil/tuple-batch-utils.h"
	#include "gtest/gtest.h"

	namespace dbcommon {

	/*
	* In this test, we are going to check all the functions in
	* native-hash-table.h and group-keys.h, and the related
	* checkGroupByKey/checkGroupByKeys in Vector.
	* We need to check the combination of (with nulls/no nulls) * (direct
	* encoding/dictionary encoding) * (with select list/without select list).
	*/

	class TestHashTable : public ::testing::Test {
	public:
	TestHashTable() {
	desc = TupleBatchUtility::generateTupleDesc(
	"schema: "
	"boolean "
	"int8 int16 int32 int64 "
	"float double decimal decimal_new "
	"varchar varchar(5) string "
	"binary "
	"timestamp time date");
	}

	void test(std::vector<uint64_t> grpIdxs) {
	std::string hashkey;
	for (auto idx : grpIdxs) {
	hashkey = hashkey + ' ' +
	TypeUtil::instance()
	->getTypeEntryById(desc->getColumnType(idx - 1))
	->name;
	}
	LOG_TESTING("hash key: %s", hashkey.c_str());
	hashTable.reset(new HashTable(*desc, grpIdxs, 1));
	testInsert(grpIdxs, true, true);
	testInsert(grpIdxs, true, false);
	testInsert(grpIdxs, false, true);
	testInsert(grpIdxs, false, false);

	testCheck(grpIdxs, true, true);
	testCheck(grpIdxs, true, false);
	testCheck(grpIdxs, true, true);
	testCheck(grpIdxs, false, false);
	}

	private:
	typedef NativeAggHashTable HashTable;

	void resetTupleBatch(bool hasNull, bool withSelectList) {
	batch = TupleBatchUtility::generateTupleBatchRandom(*desc, 1, 20, hasNull,
	withSelectList);
	}

	void resetAnsForInsert(std::vector<uint64_t> grpIdxs) {
	groupKeys.clear();
	ans.clear();
	for (int rowIdx = 0; rowIdx < batch->getNumOfRows(); rowIdx++) {
	std::string groupKey = "";
	for (auto grpIdx : grpIdxs) {
	bool null;
	std::string tmp = batch->vectors[grpIdx - 1]->read(rowIdx, &null);
	if (!null) groupKey += tmp;
	groupKey += null ? '$' : '@';
	// LOG_INFO("%s", groupKey.c_str());
	}
	if (groupKeys.find(groupKey) != groupKeys.end()) {
	ans.push_back(groupKeys[groupKey]);
	} else {
	ans.push_back(groupKeys.size());
	groupKeys[groupKey] = ans.back();
	}
	}
	}

	void testInsert(std::vector<uint64_t> grpIdxs, bool hasNull,
	bool withSelectList) {
	LOG_TESTING("Insert TB: %s nulls\t%s select list", (hasNull ? "has" : "no"),
	(withSelectList ? "with" : "without"));
	resetTupleBatch(hasNull, withSelectList);
	resetAnsForInsert(grpIdxs);
	hashTable->reset();
	for (int iter = 0; iter < 2; iter++) {
	std::vector<uint64_t> ret = hashTable->insert(batch.get());
	EXPECT_EQ(ans, ret);
	}
	}

	void resetAnsForCheck(std::vector<uint64_t> grpIdxs) {
	ans.clear();
	for (int rowIdx = 0; rowIdx < batch->getNumOfRows(); rowIdx++) {
	std::string groupKey = "";
	for (auto grpIdx : grpIdxs) {
	bool null;
	std::string tmp = batch->vectors[grpIdx - 1]->read(rowIdx, &null);
	if (!null) groupKey += tmp;
	groupKey += null ? '$' : '@';
	// LOG_INFO("%s", groupKey.c_str());
	}
	if (groupKeys.find(groupKey) != groupKeys.end()) {
	ans.push_back(groupKeys[groupKey]);
	} else {
	ans.push_back(NOT_IN_HASHTABLE);
	}
	}
	}

	void testCheck(std::vector<uint64_t> grpIdxs, bool hasNull,
	bool withSelectList) {
	LOG_TESTING("Check TB: %s nulls\t%s select list", (hasNull ? "has" : "no"),
	(withSelectList ? "with" : "without"));
	resetTupleBatch(hasNull, withSelectList);
	resetAnsForCheck(grpIdxs);
	std::vector<uint64_t> ret = hashTable->check(batch.get());
	EXPECT_EQ(ans, ret);
	}

	dbcommon::TupleDesc::uptr desc;
	dbcommon::TupleBatch::uptr batch;
	std::map<std::string, uint64_t> groupKeys;
	std::vector<uint64_t> ans;
	std::unique_ptr<HashTable> hashTable;
	};

	TEST_F(TestHashTable, TestHashOnMutipleColumn) {
	  // Use every strictly increasing triple of 1-based column indexes as the
	  // hash key.
	  const auto numCols = desc->getNumOfColumns();
	  for (uint64_t first = 1; first <= numCols; ++first) {
	    for (uint64_t second = first + 1; second <= numCols; ++second) {
	      for (uint64_t third = second + 1; third <= numCols; ++third) {
	        test({first, second, third});
	      }
	    }
	  }
	}

	TEST_F(TestHashTable, TestHashOnSingleColumn) {
	  // Use each column on its own as the hash key (indexes are 1-based).
	  const auto numCols = desc->getNumOfColumns();
	  for (uint64_t col = 1; col <= numCols; ++col) {
	    test({col});
	  }
	}

	// Also test internal NormalAccess and QuickAccess at the same time.
	TEST_F(TestHashTable, TestResize) {
	  // Locals carry their own names so they do not shadow the fixture's `desc`,
	  // `hashTable`, `batch` and `ans` members.
	  dbcommon::TupleBatchUtility tbu;
	  // The factory result is already a temporary; wrapping it in std::move adds
	  // nothing and only inhibits copy elision.
	  // NOTE(review): "ifs" presumably stands for an int, a float and a string
	  // column; confirm against TupleBatchUtility::generateTupleDesc.
	  dbcommon::TupleDesc::uptr resizeDesc = tbu.generateTupleDesc("ifs");
	  std::unique_ptr<HashTable> table(new HashTable(*resizeDesc, {1, 2, 3}, 1));

	  {
	    // First batch: column 1 differs on every row, so row i is expected to
	    // open group i.
	    dbcommon::TupleBatch::uptr firstBatch(
	        new dbcommon::TupleBatch(*resizeDesc, true));
	    dbcommon::TupleBatchWriter &writer = firstBatch->getTupleBatchWriter();
	    std::vector<uint64_t> expected;
	    for (int i = 0; i < DEFAULT_NUMBER_TUPLES_PER_BATCH; i++) {
	      writer[0]->append(std::to_string(i), false);
	      writer[1]->append("3.14", false);
	      writer[2]->append("hah", false);
	      expected.push_back(i);
	    }
	    firstBatch->incNumOfRows(DEFAULT_NUMBER_TUPLES_PER_BATCH);
	    std::vector<uint64_t> ret = table->insert(firstBatch.get());
	    EXPECT_EQ(expected, ret);
	    // Memory used must stay within the fixed block budget.
	    EXPECT_GE(
	        DEFAULT_SIZE_PER_HASHKEY_BLK + DEFAULT_SIZE_PER_HASH_CHAIN_BLK * 2,
	        table->getMemUsed());
	  }

	  {
	    // Second batch: column 3 differs on every row and never equals "hah", so
	    // every row opens a new group numbered after those of the first batch.
	    // Per the original author, inserting past one full batch triggers a
	    // resize of the table.
	    dbcommon::TupleBatch::uptr secondBatch(
	        new dbcommon::TupleBatch(*resizeDesc, true));
	    dbcommon::TupleBatchWriter &writer = secondBatch->getTupleBatchWriter();
	    std::vector<uint64_t> expected;
	    for (int i = 0; i < DEFAULT_NUMBER_TUPLES_PER_BATCH; i++) {
	      writer[0]->append("0", false);
	      writer[1]->append("3.14", false);
	      writer[2]->append(std::to_string(i), false);
	      expected.push_back(i + DEFAULT_NUMBER_TUPLES_PER_BATCH);
	    }
	    secondBatch->incNumOfRows(DEFAULT_NUMBER_TUPLES_PER_BATCH);
	    std::vector<uint64_t> ret = table->insert(secondBatch.get());
	    EXPECT_EQ(expected, ret);
	    // Re-inserting the same rows must return the ids assigned above.
	    ret = table->insert(secondBatch.get());
	    EXPECT_EQ(expected, ret);
	    EXPECT_GE(
	        DEFAULT_SIZE_PER_HASHKEY_BLK + DEFAULT_SIZE_PER_HASH_CHAIN_BLK * 2,
	        table->getMemUsed());
	  }
	}

	TEST_F(TestHashTable, TestGetHashkeys) {
	  // An aggregation table and a join table fed with the same batch must
	  // report identical hash keys.
	  TupleBatchUtility tbu;
	  auto wideDesc = tbu.generateTupleDesc(
	      "schema: "
	      "boolean "
	      "int8 int16 int32 int64 "
	      "float double decimal_new "
	      "bpchar bpchar(10) varchar varchar(5) string "
	      "binary "
	      "timestamp time date");

	  // Hash on every column: 1, 2, ..., N.
	  std::vector<uint64_t> allCols(wideDesc->getNumOfColumns());
	  std::iota(allCols.begin(), allCols.end(), 1);

	  NativeAggHashTable aggTable(*wideDesc, allCols, 0);
	  NativeJoinHashTable joinTable(*wideDesc, allCols, 0);
	  // NOTE(review): the descriptor handle itself is passed here, while the
	  // constructors above receive the dereferenced descriptor; confirm which
	  // form setupJoin() expects.
	  joinTable.setupJoin(true, wideDesc, allCols, wideDesc, allCols);

	  auto randomBatch = tbu.generateTupleBatchRandom(*wideDesc, 0, 10, true);
	  aggTable.insert(randomBatch.get());
	  joinTable.insert(randomBatch.get());

	  auto aggKeys = aggTable.getHashKeys();
	  auto joinKeys = joinTable.getHashKeys();
	  EXPECT_EQ(aggKeys->toString(), joinKeys->toString());
	}

	} // namespace dbcommon