blob: c6a681a24442eddd02f4b0cc66b87bca0ce27283 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include <CLucene/debug/error.h>
#include <CLucene/debug/mem.h>
#include <CLucene/index/Term.h>
#include <CLucene/search/Query.h>
#include <CLucene/search/RangeQuery.h>
#include <CLucene/store/Directory.h>
#include <CLucene/store/IndexInput.h>
#include <CLucene/util/FutureArrays.h>
#include <CLucene/util/bkd/bkd_docid_iterator.h>
#include <CLucene/util/stringUtil.h>
#include <memory>
#include <ostream>
#include <roaring/roaring.hh>
#include <set>
#include <string>
#include "common/config.h"
#include "common/exception.h"
#include "common/logging.h"
#include "common/status.h"
#include "inverted_index_query_type.h"
#include "olap/field.h"
#include "olap/inverted_index_parser.h"
#include "olap/key_coder.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/index_file_reader.h"
#include "olap/rowset/segment_v2/index_reader_helper.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
#include "olap/rowset/segment_v2/inverted_index/query/query_factory.h"
#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/rowset/segment_v2/inverted_index_fs_directory.h"
#include "olap/rowset/segment_v2/inverted_index_iterator.h"
#include "olap/rowset/segment_v2/inverted_index_searcher.h"
#include "olap/types.h"
#include "runtime/runtime_state.h"
#include "util/faststring.h"
#include "util/runtime_profile.h"
#include "vec/common/string_ref.h"
namespace doris::segment_v2 {
#include "common/compile_check_begin.h"
std::string InvertedIndexReader::get_index_file_path() {
return _index_file_reader->get_index_file_path(&_index_meta);
}
Status InvertedIndexReader::read_null_bitmap(const IndexQueryContextPtr& context,
InvertedIndexQueryCacheHandle* cache_handle,
lucene::store::Directory* dir) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_null_bitmap_timer);
lucene::store::IndexInput* null_bitmap_in = nullptr;
bool owned_dir = false;
try {
// try to get query bitmap result from cache and return immediately on cache hit
auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
InvertedIndexQueryCache::CacheKey cache_key {
index_file_key, "", InvertedIndexQueryType::UNKNOWN_QUERY, "null_bitmap"};
auto* cache = InvertedIndexQueryCache::instance();
if (cache->lookup(cache_key, cache_handle)) {
return Status::OK();
}
if (!dir) {
// TODO: ugly code here, try to refact.
auto st = _index_file_reader->init(config::inverted_index_read_buffer_size,
context->io_ctx);
if (!st.ok()) {
LOG(WARNING) << st;
return st;
}
auto directory = DORIS_TRY(_index_file_reader->open(&_index_meta, context->io_ctx));
dir = directory.release();
owned_dir = true;
}
// ownership of null_bitmap and its deletion will be transfered to cache
std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
const char* null_bitmap_file_name =
InvertedIndexDescriptor::get_temporary_null_bitmap_file_name();
if (dir->fileExists(null_bitmap_file_name)) {
null_bitmap_in = dir->openInput(null_bitmap_file_name);
auto null_bitmap_size = cast_set<int32_t>(null_bitmap_in->length());
faststring buf;
buf.resize(null_bitmap_size);
null_bitmap_in->readBytes(buf.data(), null_bitmap_size);
*null_bitmap = roaring::Roaring::read(reinterpret_cast<char*>(buf.data()), false);
null_bitmap->runOptimize();
cache->insert(cache_key, null_bitmap, cache_handle);
FINALIZE_INPUT(null_bitmap_in);
} else {
cache->insert(cache_key, null_bitmap, cache_handle);
}
if (owned_dir) {
FINALIZE_INPUT(dir);
}
} catch (CLuceneError& e) {
FINALLY_FINALIZE_INPUT(null_bitmap_in);
if (owned_dir) {
FINALLY_FINALIZE_INPUT(dir);
}
return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"Inverted index read null bitmap error occurred, reason={}", e.what());
}
return Status::OK();
}
bool InvertedIndexReader::handle_query_cache(const IndexQueryContextPtr& context,
InvertedIndexQueryCache* cache,
const InvertedIndexQueryCache::CacheKey& cache_key,
InvertedIndexQueryCacheHandle* cache_handler,
std::shared_ptr<roaring::Roaring>& bit_map) {
const auto& query_options = context->runtime_state->query_options();
bool cache_hit = false;
if (query_options.enable_inverted_index_query_cache) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_lookup_timer);
cache_hit = cache->lookup(cache_key, cache_handler);
}
if (cache_hit) {
DBUG_EXECUTE_IF("InvertedIndexReader.handle_query_cache_hit", {
return Status::Error<ErrorCode::INTERNAL_ERROR>("handle query cache hit");
});
context->stats->inverted_index_query_cache_hit++;
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_bitmap_copy_timer);
bit_map = cache_handler->get_bitmap();
return true;
}
DBUG_EXECUTE_IF("InvertedIndexReader.handle_query_cache_miss", {
return Status::Error<ErrorCode::INTERNAL_ERROR>("handle query cache miss");
});
context->stats->inverted_index_query_cache_miss++;
return false;
}
Status InvertedIndexReader::handle_searcher_cache(
const IndexQueryContextPtr& context,
InvertedIndexCacheHandle* inverted_index_cache_handle) {
auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key);
const auto& query_options = context->runtime_state->query_options();
bool cache_hit = false;
if (query_options.enable_inverted_index_searcher_cache) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_lookup_timer);
cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key,
inverted_index_cache_handle);
}
if (cache_hit) {
DBUG_EXECUTE_IF("InvertedIndexReader.handle_searcher_cache_hit", {
return Status::Error<ErrorCode::INTERNAL_ERROR>("handle searcher cache hit");
});
context->stats->inverted_index_searcher_cache_hit++;
return Status::OK();
} else {
SCOPED_RAW_TIMER(&context->stats->inverted_index_searcher_open_timer);
DBUG_EXECUTE_IF("InvertedIndexReader.handle_searcher_cache_miss", {
return Status::Error<ErrorCode::INTERNAL_ERROR>("handle searcher cache miss");
});
// searcher cache miss
context->stats->inverted_index_searcher_cache_miss++;
auto mem_tracker = std::make_unique<MemTracker>("InvertedIndexSearcherCacheWithRead");
IndexSearcherPtr searcher;
auto st =
_index_file_reader->init(config::inverted_index_read_buffer_size, context->io_ctx);
if (!st.ok()) {
LOG(WARNING) << st;
return st;
}
auto dir = DORIS_TRY(_index_file_reader->open(&_index_meta, context->io_ctx));
DBUG_EXECUTE_IF(
"InvertedIndexReader.handle_searcher_cache.io_ctx", ({
if (dir) {
auto* stream = dir->getDorisIndexInput();
const auto* cur_io_ctx = (const io::IOContext*)stream->getIoContext();
if (cur_io_ctx->file_cache_stats) {
if (cur_io_ctx->file_cache_stats != &context->stats->file_cache_stats) {
LOG(FATAL) << "io context file cache stats is not equal to "
"stats file cache "
"stats: "
<< cur_io_ctx->file_cache_stats << ", "
<< &context->stats->file_cache_stats;
}
}
}
}));
// try to reuse index_searcher's directory to read null_bitmap to cache
// to avoid open directory additionally for null_bitmap
// TODO: handle null bitmap procedure in new format.
InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
RETURN_IF_ERROR(read_null_bitmap(context, &null_bitmap_cache_handle, dir.get()));
size_t reader_size = 0;
auto index_searcher_builder =
DORIS_TRY(IndexSearcherBuilder::create_index_searcher_builder(type()));
RETURN_IF_ERROR(create_index_searcher(index_searcher_builder.get(), dir.get(), &searcher,
reader_size));
auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher),
reader_size, UnixMillis());
InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value,
inverted_index_cache_handle);
return Status::OK();
}
}
Status InvertedIndexReader::create_index_searcher(IndexSearcherBuilder* index_searcher_builder,
lucene::store::Directory* dir,
IndexSearcherPtr* searcher, size_t& reader_size) {
auto searcher_result = DORIS_TRY(index_searcher_builder->get_index_searcher(dir));
*searcher = searcher_result;
// When the meta information has been read, the ioContext needs to be reset to prevent it from being used by other queries.
auto* stream = static_cast<DorisCompoundReader*>(dir)->getDorisIndexInput();
stream->setIoContext(nullptr);
stream->setIndexFile(false);
reader_size = index_searcher_builder->get_reader_size();
return Status::OK();
};
Status InvertedIndexReader::match_index_search(
const IndexQueryContextPtr& context, InvertedIndexQueryType query_type,
const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher,
const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
auto* reader = index_searcher->getReader();
if (context->runtime_state &&
context->runtime_state->query_options().inverted_index_compatible_read) {
reader->setCompatibleRead(true);
}
try {
SCOPED_RAW_TIMER(&context->stats->inverted_index_searcher_search_timer);
auto query = QueryFactory::create(query_type, index_searcher, context);
if (!query) {
return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
"query type " + query_type_to_string(query_type) + ", query is nullptr");
}
{
SCOPED_RAW_TIMER(&context->stats->inverted_index_searcher_search_init_timer);
query->add(query_info);
}
{
SCOPED_RAW_TIMER(&context->stats->inverted_index_searcher_search_exec_timer);
query->search(*term_match_bitmap);
}
} catch (const CLuceneError& e) {
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}",
e.what());
} catch (const Exception& e) {
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("Exception occured: {}",
e.what());
}
return Status::OK();
}
Status FullTextIndexReader::new_iterator(std::unique_ptr<IndexIterator>* iterator) {
if (*iterator == nullptr) {
*iterator = InvertedIndexIterator::create_unique();
}
dynamic_cast<InvertedIndexIterator*>(iterator->get())
->add_reader(InvertedIndexReaderType::FULLTEXT,
dynamic_pointer_cast<InvertedIndexReader>(shared_from_this()));
return Status::OK();
}
Status FullTextIndexReader::query(const IndexQueryContextPtr& context,
const std::string& column_name, const void* query_value,
InvertedIndexQueryType query_type,
std::shared_ptr<roaring::Roaring>& bit_map,
const InvertedIndexAnalyzerCtx* analyzer_ctx) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
std::string search_str = *reinterpret_cast<const std::string*>(query_value);
VLOG_DEBUG << column_name << " begin to search the fulltext index from clucene, query_str ["
<< search_str << "]";
const auto& queryOptions = context->runtime_state->query_options();
try {
InvertedIndexQueryInfo query_info;
InvertedIndexQueryCache::CacheKey cache_key;
auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
// terms
if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
query_info.term_infos.emplace_back(search_str, 0);
} else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
PhraseQuery::parser_info(context->stats, search_str, _index_meta.properties(),
query_info);
} else {
SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) {
auto reader = inverted_index::InvertedIndexAnalyzer::create_reader(
analyzer_ctx->char_filter_map);
reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
query_info.term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
reader, analyzer_ctx->analyzer);
} else {
query_info.term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
search_str, _index_meta.properties());
}
}
if (query_info.term_infos.empty()) {
auto msg = fmt::format(
"token parser result is empty for query, "
"please check your query: '{}' and index parser: '{}'",
search_str, get_parser_string_from_properties(_index_meta.properties()));
if (is_match_query(query_type)) {
LOG(WARNING) << msg;
return Status::OK();
} else {
return Status::Error<ErrorCode::INVERTED_INDEX_NO_TERMS>(msg);
}
}
// field_name
query_info.field_name = StringUtil::string_to_wstring(column_name);
// cache_key
std::string str_tokens = query_info.generate_tokens_key();
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
str_tokens += " " + std::to_string(query_info.slop);
str_tokens += " " + std::to_string(query_info.ordered);
} else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
str_tokens += " " + std::to_string(queryOptions.inverted_index_max_expansions);
}
cache_key = {index_file_key, column_name, query_type, std::move(str_tokens)};
if (IndexReaderHelper::is_need_similarity_score(query_type, &_index_meta)) {
query_info.is_similarity_score = true;
}
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
std::shared_ptr<roaring::Roaring> term_match_bitmap = nullptr;
// Queries that require scoring will not hit the cache
if (!(context->collection_similarity && query_info.is_similarity_score)) {
if (handle_query_cache(context, cache, cache_key, &cache_handler, bit_map)) {
return Status::OK();
}
}
InvertedIndexCacheHandle inverted_index_cache_handle;
RETURN_IF_ERROR(handle_searcher_cache(context, &inverted_index_cache_handle));
auto searcher_variant = inverted_index_cache_handle.get_index_searcher();
auto* searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
if (searcher_ptr != nullptr) {
term_match_bitmap = std::make_shared<roaring::Roaring>();
RETURN_IF_ERROR(match_index_search(context, query_type, query_info, *searcher_ptr,
term_match_bitmap));
term_match_bitmap->runOptimize();
cache->insert(cache_key, term_match_bitmap, &cache_handler);
bit_map = term_match_bitmap;
}
return Status::OK();
} catch (const CLuceneError& e) {
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"CLuceneError occured, error msg: {}", e.what());
}
}
InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}
Status StringTypeInvertedIndexReader::new_iterator(std::unique_ptr<IndexIterator>* iterator) {
if (*iterator == nullptr) {
*iterator = InvertedIndexIterator::create_unique();
}
dynamic_cast<InvertedIndexIterator*>(iterator->get())
->add_reader(InvertedIndexReaderType::STRING_TYPE,
dynamic_pointer_cast<InvertedIndexReader>(shared_from_this()));
return Status::OK();
}
Status StringTypeInvertedIndexReader::query(const IndexQueryContextPtr& context,
const std::string& column_name, const void* query_value,
InvertedIndexQueryType query_type,
std::shared_ptr<roaring::Roaring>& bit_map,
const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
std::string search_str = *reinterpret_cast<const std::string*>(query_value);
// If the written value exceeds ignore_above, it will be written as null.
// The queried value exceeds ignore_above means the written value cannot be found.
// The query needs to be downgraded to read from the segment file.
if (int ignore_above =
std::stoi(get_parser_ignore_above_value_from_properties(_index_meta.properties()));
search_str.size() > ignore_above) {
return Status::Error<ErrorCode::INVERTED_INDEX_EVALUATE_SKIPPED>(
"query value is too long, evaluate skipped.");
}
VLOG_DEBUG << "begin to query the inverted index from clucene"
<< ", column_name: " << column_name << ", search_str: " << search_str;
try {
auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
// try to get query bitmap result from cache and return immediately on cache hit
InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type,
search_str};
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
if (handle_query_cache(context, cache, cache_key, &cache_handler, bit_map)) {
return Status::OK();
}
std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
InvertedIndexQueryInfo query_info;
query_info.field_name = column_name_ws;
query_info.term_infos.emplace_back(search_str, 0);
auto result = std::make_shared<roaring::Roaring>();
FulltextIndexSearcherPtr* searcher_ptr = nullptr;
InvertedIndexCacheHandle inverted_index_cache_handle;
RETURN_IF_ERROR(handle_searcher_cache(context, &inverted_index_cache_handle));
auto searcher_variant = inverted_index_cache_handle.get_index_searcher();
searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
if (searcher_ptr != nullptr) {
switch (query_type) {
case InvertedIndexQueryType::MATCH_ANY_QUERY:
case InvertedIndexQueryType::MATCH_ALL_QUERY:
case InvertedIndexQueryType::EQUAL_QUERY: {
RETURN_IF_ERROR(match_index_search(context, InvertedIndexQueryType::MATCH_ANY_QUERY,
query_info, *searcher_ptr, result));
break;
}
case InvertedIndexQueryType::MATCH_PHRASE_QUERY:
case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY:
case InvertedIndexQueryType::MATCH_REGEXP_QUERY: {
RETURN_IF_ERROR(
match_index_search(context, query_type, query_info, *searcher_ptr, result));
break;
}
case InvertedIndexQueryType::LESS_THAN_QUERY:
case InvertedIndexQueryType::LESS_EQUAL_QUERY:
case InvertedIndexQueryType::GREATER_THAN_QUERY:
case InvertedIndexQueryType::GREATER_EQUAL_QUERY: {
std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
// unique_ptr with custom deleter
std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
_CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()),
[](lucene::index::Term* term) { _CLDECDELETE(term); }};
std::unique_ptr<lucene::search::Query> query;
bool include_upper = query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY;
bool include_lower = query_type == InvertedIndexQueryType::GREATER_EQUAL_QUERY;
if (query_type == InvertedIndexQueryType::LESS_THAN_QUERY ||
query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY) {
query = std::make_unique<lucene::search::RangeQuery>(nullptr, term.get(),
include_upper);
} else { // GREATER_THAN_QUERY or GREATER_EQUAL_QUERY
query = std::make_unique<lucene::search::RangeQuery>(term.get(), nullptr,
include_lower);
}
SCOPED_RAW_TIMER(&context->stats->inverted_index_searcher_search_timer);
(*searcher_ptr)
->_search(query.get(),
[&result](const int32_t docid, const float_t /*score*/) {
result->add(docid);
});
break;
}
default:
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
"invalid query type when query untokenized inverted index");
}
}
// add to cache
result->runOptimize();
cache->insert(cache_key, result, &cache_handler);
bit_map = result;
return Status::OK();
} catch (const CLuceneError& e) {
if (is_range_query(query_type) && e.number() == CL_ERR_TooManyClauses) {
return Status::Error<ErrorCode::INVERTED_INDEX_BYPASS>(
"range query term exceeds limits, try to downgrade from inverted index, "
"column "
"name:{}, search_str:{}",
column_name, search_str);
} else {
LOG(ERROR) << "CLuceneError occurred, error msg: " << e.what()
<< ", column name: " << column_name << ", search_str: " << search_str;
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"CLuceneError occurred, error msg: {}, column name: {}, search_str: {}",
e.what(), column_name, search_str);
}
}
}
InvertedIndexReaderType StringTypeInvertedIndexReader::type() {
return InvertedIndexReaderType::STRING_TYPE;
}
Status BkdIndexReader::new_iterator(std::unique_ptr<IndexIterator>* iterator) {
if (*iterator == nullptr) {
*iterator = InvertedIndexIterator::create_unique();
}
dynamic_cast<InvertedIndexIterator*>(iterator->get())
->add_reader(InvertedIndexReaderType::BKD,
dynamic_pointer_cast<InvertedIndexReader>(shared_from_this()));
return Status::OK();
}
template <InvertedIndexQueryType QT>
Status BkdIndexReader::construct_bkd_query_value(const void* query_value,
std::shared_ptr<lucene::util::bkd::bkd_reader> r,
InvertedIndexVisitor<QT>* visitor) {
std::vector<char> tmp(r->bytes_per_dim_);
if constexpr (QT == InvertedIndexQueryType::EQUAL_QUERY) {
_value_key_coder->full_encode_ascending(query_value, &visitor->query_max);
_value_key_coder->full_encode_ascending(query_value, &visitor->query_min);
} else if constexpr (QT == InvertedIndexQueryType::LESS_THAN_QUERY ||
QT == InvertedIndexQueryType::LESS_EQUAL_QUERY) {
_value_key_coder->full_encode_ascending(query_value, &visitor->query_max);
_type_info->set_to_min(tmp.data());
_value_key_coder->full_encode_ascending(tmp.data(), &visitor->query_min);
} else if constexpr (QT == InvertedIndexQueryType::GREATER_THAN_QUERY ||
QT == InvertedIndexQueryType::GREATER_EQUAL_QUERY) {
_value_key_coder->full_encode_ascending(query_value, &visitor->query_min);
_type_info->set_to_max(tmp.data());
_value_key_coder->full_encode_ascending(tmp.data(), &visitor->query_max);
} else {
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
"invalid query type when query bkd index");
}
return Status::OK();
}
Status BkdIndexReader::invoke_bkd_try_query(const IndexQueryContextPtr& context,
const void* query_value,
InvertedIndexQueryType query_type,
std::shared_ptr<lucene::util::bkd::bkd_reader> r,
size_t* count) {
switch (query_type) {
case InvertedIndexQueryType::LESS_THAN_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::LESS_THAN_QUERY>>(
context->io_ctx, r.get(), nullptr, true);
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
*count = r->estimate_point_count(visitor.get());
break;
}
case InvertedIndexQueryType::LESS_EQUAL_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::LESS_EQUAL_QUERY>>(
context->io_ctx, r.get(), nullptr, true);
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
*count = r->estimate_point_count(visitor.get());
break;
}
case InvertedIndexQueryType::GREATER_THAN_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::GREATER_THAN_QUERY>>(
context->io_ctx, r.get(), nullptr, true);
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
*count = r->estimate_point_count(visitor.get());
break;
}
case InvertedIndexQueryType::GREATER_EQUAL_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::GREATER_EQUAL_QUERY>>(
context->io_ctx, r.get(), nullptr, true);
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
*count = r->estimate_point_count(visitor.get());
break;
}
case InvertedIndexQueryType::EQUAL_QUERY: {
auto visitor = std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::EQUAL_QUERY>>(
context->io_ctx, r.get(), nullptr, true);
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
*count = r->estimate_point_count(visitor.get());
break;
}
default:
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>("Invalid query type");
}
return Status::OK();
}
Status BkdIndexReader::invoke_bkd_query(const IndexQueryContextPtr& context,
const void* query_value, InvertedIndexQueryType query_type,
std::shared_ptr<lucene::util::bkd::bkd_reader> r,
std::shared_ptr<roaring::Roaring>& bit_map) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_searcher_search_timer);
switch (query_type) {
case InvertedIndexQueryType::LESS_THAN_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::LESS_THAN_QUERY>>(
context->io_ctx, r.get(), bit_map.get());
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
r->intersect(visitor.get());
break;
}
case InvertedIndexQueryType::LESS_EQUAL_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::LESS_EQUAL_QUERY>>(
context->io_ctx, r.get(), bit_map.get());
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
r->intersect(visitor.get());
break;
}
case InvertedIndexQueryType::GREATER_THAN_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::GREATER_THAN_QUERY>>(
context->io_ctx, r.get(), bit_map.get());
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
r->intersect(visitor.get());
break;
}
case InvertedIndexQueryType::GREATER_EQUAL_QUERY: {
auto visitor =
std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::GREATER_EQUAL_QUERY>>(
context->io_ctx, r.get(), bit_map.get());
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
r->intersect(visitor.get());
break;
}
case InvertedIndexQueryType::EQUAL_QUERY: {
auto visitor = std::make_unique<InvertedIndexVisitor<InvertedIndexQueryType::EQUAL_QUERY>>(
context->io_ctx, r.get(), bit_map.get());
RETURN_IF_ERROR(construct_bkd_query_value(query_value, r, visitor.get()));
r->intersect(visitor.get());
break;
}
default:
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>("Invalid query type");
}
return Status::OK();
}
Status BkdIndexReader::try_query(const IndexQueryContextPtr& context,
const std::string& column_name, const void* query_value,
InvertedIndexQueryType query_type, size_t* count) {
try {
std::shared_ptr<lucene::util::bkd::bkd_reader> r;
auto st = get_bkd_reader(context, r);
if (!st.ok()) {
LOG(WARNING) << "get bkd reader for "
<< _index_file_reader->get_index_file_path(&_index_meta)
<< " failed: " << st;
return st;
}
std::string query_str;
_value_key_coder->full_encode_ascending(query_value, &query_str);
auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type,
query_str};
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
std::shared_ptr<roaring::Roaring> bit_map;
if (handle_query_cache(context, cache, cache_key, &cache_handler, bit_map)) {
*count = bit_map->cardinality();
return Status::OK();
}
return invoke_bkd_try_query(context, query_value, query_type, r, count);
} catch (const CLuceneError& e) {
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"BKD Query CLuceneError Occurred, error msg: {}", e.what());
}
VLOG_DEBUG << "BKD index try search column: " << column_name << " result: " << *count;
return Status::OK();
}
Status BkdIndexReader::query(const IndexQueryContextPtr& context, const std::string& column_name,
const void* query_value, InvertedIndexQueryType query_type,
std::shared_ptr<roaring::Roaring>& bit_map,
const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/) {
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
try {
std::shared_ptr<lucene::util::bkd::bkd_reader> r;
auto st = get_bkd_reader(context, r);
if (!st.ok()) {
LOG(WARNING) << "get bkd reader for "
<< _index_file_reader->get_index_file_path(&_index_meta)
<< " failed: " << st;
return st;
}
std::string query_str;
_value_key_coder->full_encode_ascending(query_value, &query_str);
auto index_file_key = _index_file_reader->get_index_file_cache_key(&_index_meta);
InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type,
query_str};
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
if (handle_query_cache(context, cache, cache_key, &cache_handler, bit_map)) {
return Status::OK();
}
RETURN_IF_ERROR(invoke_bkd_query(context, query_value, query_type, r, bit_map));
bit_map->runOptimize();
cache->insert(cache_key, bit_map, &cache_handler);
VLOG_DEBUG << "BKD index search column: " << column_name
<< " result: " << bit_map->cardinality();
return Status::OK();
} catch (const CLuceneError& e) {
LOG(ERROR) << "BKD Query CLuceneError Occurred, error msg: " << e.what()
<< " file_path:" << _index_file_reader->get_index_file_path(&_index_meta);
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"BKD Query CLuceneError Occurred, error msg: {}", e.what());
}
}
Status BkdIndexReader::get_bkd_reader(const IndexQueryContextPtr& context,
BKDIndexSearcherPtr& bkd_reader) {
BKDIndexSearcherPtr* bkd_searcher = nullptr;
InvertedIndexCacheHandle inverted_index_cache_handle;
RETURN_IF_ERROR(handle_searcher_cache(context, &inverted_index_cache_handle));
auto searcher_variant = inverted_index_cache_handle.get_index_searcher();
bkd_searcher = std::get_if<BKDIndexSearcherPtr>(&searcher_variant);
if (bkd_searcher) {
_type_info = get_scalar_type_info((FieldType)(*bkd_searcher)->type);
if (_type_info == nullptr) {
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
"unsupported typeinfo, type={}", (*bkd_searcher)->type);
}
_value_key_coder = get_key_coder(_type_info->type());
bkd_reader = *bkd_searcher;
if (bkd_reader->bytes_per_dim_ == 0) {
bkd_reader->bytes_per_dim_ = cast_set<int32_t>(_type_info->size());
}
return Status::OK();
}
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"get bkd reader from searcher cache builder error");
}
InvertedIndexReaderType BkdIndexReader::type() {
return InvertedIndexReaderType::BKD;
}
template <InvertedIndexQueryType QT>
InvertedIndexVisitor<QT>::InvertedIndexVisitor(const void* io_ctx, lucene::util::bkd::bkd_reader* r,
roaring::Roaring* h, bool only_count)
: _io_ctx(io_ctx), _hits(h), _num_hits(0), _only_count(only_count), _reader(r) {}
template <InvertedIndexQueryType QT>
int InvertedIndexVisitor<QT>::matches(uint8_t* packed_value) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
bool all_greater_than_max = true;
bool all_within_range = true;
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
auto result_max = lucene::util::FutureArrays::CompareUnsigned(
packed_value, offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset, offset + _reader->bytes_per_dim_);
auto result_min = lucene::util::FutureArrays::CompareUnsigned(
packed_value, offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset, offset + _reader->bytes_per_dim_);
all_greater_than_max &= (result_max > 0);
all_within_range &= (result_min > 0 && result_max < 0);
if (!all_greater_than_max && !all_within_range) {
return -1;
}
}
if (all_greater_than_max) {
return 1;
} else if (all_within_range) {
return 0;
} else {
return -1;
}
}
template <>
int InvertedIndexVisitor<InvertedIndexQueryType::EQUAL_QUERY>::matches(uint8_t* packed_value) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
// if query type is equal, query_min == query_max
if (_reader->num_data_dims_ == 1) {
return std::memcmp(packed_value, (const uint8_t*)query_min.c_str(),
_reader->bytes_per_dim_);
} else {
// if all dim value > matched value, then return > 0, otherwise return < 0
int return_result = 0;
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
auto result = lucene::util::FutureArrays::CompareUnsigned(
packed_value, offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset, offset + _reader->bytes_per_dim_);
if (result < 0) {
return -1;
} else if (result > 0) {
return_result = 1;
}
}
return return_result;
}
}
template <>
int InvertedIndexVisitor<InvertedIndexQueryType::LESS_THAN_QUERY>::matches(uint8_t* packed_value) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
if (_reader->num_data_dims_ == 1) {
auto result = std::memcmp(packed_value, (const uint8_t*)query_max.c_str(),
_reader->bytes_per_dim_);
if (result >= 0) {
return 1;
}
return 0;
} else {
bool all_greater_or_equal = true;
bool all_lesser = true;
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
auto result = lucene::util::FutureArrays::CompareUnsigned(
packed_value, offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset, offset + _reader->bytes_per_dim_);
all_greater_or_equal &=
(result >= 0); // Remains true only if all results are greater or equal
all_lesser &= (result < 0); // Remains true only if all results are lesser
}
// Return 1 if all values are greater or equal, 0 if all are lesser, otherwise -1
return all_greater_or_equal ? 1 : (all_lesser ? 0 : -1);
}
}
template <>
int InvertedIndexVisitor<InvertedIndexQueryType::LESS_EQUAL_QUERY>::matches(uint8_t* packed_value) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
if (_reader->num_data_dims_ == 1) {
auto result = std::memcmp(packed_value, (const uint8_t*)query_max.c_str(),
_reader->bytes_per_dim_);
if (result > 0) {
return 1;
}
return 0;
} else {
bool all_greater = true;
bool all_lesser_or_equal = true;
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
auto result = lucene::util::FutureArrays::CompareUnsigned(
packed_value, offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset, offset + _reader->bytes_per_dim_);
all_greater &= (result > 0); // Remains true only if all results are greater
all_lesser_or_equal &=
(result <= 0); // Remains true only if all results are lesser or equal
}
// Return 1 if all values are greater or equal, 0 if all are lesser, otherwise -1
return all_greater ? 1 : (all_lesser_or_equal ? 0 : -1);
}
}
template <>
int InvertedIndexVisitor<InvertedIndexQueryType::GREATER_THAN_QUERY>::matches(
uint8_t* packed_value) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
if (_reader->num_data_dims_ == 1) {
auto result = std::memcmp(packed_value, (const uint8_t*)query_min.c_str(),
_reader->bytes_per_dim_);
if (result <= 0) {
return -1;
}
return 0;
} else {
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
auto result = lucene::util::FutureArrays::CompareUnsigned(
packed_value, offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset, offset + _reader->bytes_per_dim_);
if (result <= 0) {
return -1;
}
}
return 0;
}
}
template <>
int InvertedIndexVisitor<InvertedIndexQueryType::GREATER_EQUAL_QUERY>::matches(
uint8_t* packed_value) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
if (_reader->num_data_dims_ == 1) {
auto result = std::memcmp(packed_value, (const uint8_t*)query_min.c_str(),
_reader->bytes_per_dim_);
if (result < 0) {
return -1;
}
return 0;
} else {
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
auto result = lucene::util::FutureArrays::CompareUnsigned(
packed_value, offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset, offset + _reader->bytes_per_dim_);
if (result < 0) {
return -1;
}
}
return 0;
}
}
template <InvertedIndexQueryType QT>
void InvertedIndexVisitor<QT>::visit(std::vector<char>& doc_id,
std::vector<uint8_t>& packed_value) {
if (matches(packed_value.data()) != 0) {
return;
}
visit(roaring::Roaring::read(doc_id.data(), false));
}
template <InvertedIndexQueryType QT>
void InvertedIndexVisitor<QT>::visit(roaring::Roaring* doc_id, std::vector<uint8_t>& packed_value) {
if (matches(packed_value.data()) != 0) {
return;
}
visit(*doc_id);
}
template <InvertedIndexQueryType QT>
void InvertedIndexVisitor<QT>::visit(roaring::Roaring&& r) {
if (_only_count) {
_num_hits += r.cardinality();
} else {
*_hits |= r;
}
}
template <InvertedIndexQueryType QT>
void InvertedIndexVisitor<QT>::visit(roaring::Roaring& r) {
if (_only_count) {
_num_hits += r.cardinality();
} else {
*_hits |= r;
}
}
template <InvertedIndexQueryType QT>
void InvertedIndexVisitor<QT>::visit(int row_id) {
if (_only_count) {
_num_hits++;
} else {
_hits->add(row_id);
}
}
template <InvertedIndexQueryType QT>
void InvertedIndexVisitor<QT>::visit(lucene::util::bkd::bkd_docid_set_iterator* iter,
std::vector<uint8_t>& packed_value) {
if (matches(packed_value.data()) != 0) {
return;
}
int32_t doc_id = iter->docid_set->nextDoc();
while (doc_id != lucene::util::bkd::bkd_docid_set::NO_MORE_DOCS) {
if (_only_count) {
_num_hits++;
} else {
_hits->add(doc_id);
}
doc_id = iter->docid_set->nextDoc();
}
}
template <InvertedIndexQueryType QT>
int InvertedIndexVisitor<QT>::visit(int row_id, std::vector<uint8_t>& packed_value) {
auto result = matches(packed_value.data());
if (result != 0) {
return result;
}
if (_only_count) {
_num_hits++;
} else {
_hits->add(row_id);
}
return 0;
}
template <>
lucene::util::bkd::relation InvertedIndexVisitor<InvertedIndexQueryType::LESS_THAN_QUERY>::compare(
std::vector<uint8_t>& min_packed, std::vector<uint8_t>& max_packed) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
bool crosses = false;
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
if (lucene::util::FutureArrays::CompareUnsigned(min_packed.data(), offset,
offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset,
offset + _reader->bytes_per_dim_) >= 0) {
return lucene::util::bkd::relation::CELL_OUTSIDE_QUERY;
}
crosses |= lucene::util::FutureArrays::CompareUnsigned(
min_packed.data(), offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset,
offset + _reader->bytes_per_dim_) <= 0 ||
lucene::util::FutureArrays::CompareUnsigned(
max_packed.data(), offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset,
offset + _reader->bytes_per_dim_) >= 0;
}
if (crosses) {
return lucene::util::bkd::relation::CELL_CROSSES_QUERY;
} else {
return lucene::util::bkd::relation::CELL_INSIDE_QUERY;
}
}
template <>
lucene::util::bkd::relation
InvertedIndexVisitor<InvertedIndexQueryType::GREATER_THAN_QUERY>::compare(
std::vector<uint8_t>& min_packed, std::vector<uint8_t>& max_packed) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
bool crosses = false;
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
if (lucene::util::FutureArrays::CompareUnsigned(max_packed.data(), offset,
offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset,
offset + _reader->bytes_per_dim_) <= 0) {
return lucene::util::bkd::relation::CELL_OUTSIDE_QUERY;
}
crosses |= lucene::util::FutureArrays::CompareUnsigned(
min_packed.data(), offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset,
offset + _reader->bytes_per_dim_) <= 0 ||
lucene::util::FutureArrays::CompareUnsigned(
max_packed.data(), offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset,
offset + _reader->bytes_per_dim_) >= 0;
}
if (crosses) {
return lucene::util::bkd::relation::CELL_CROSSES_QUERY;
} else {
return lucene::util::bkd::relation::CELL_INSIDE_QUERY;
}
}
template <InvertedIndexQueryType QT>
bkd::relation InvertedIndexVisitor<QT>::compare_prefix(std::vector<uint8_t>& prefix) {
const int32_t length = cast_set<int32_t>(prefix.size());
const uint8_t* data = prefix.data();
auto cmp = [&](const std::string& bound) {
return lucene::util::FutureArrays::CompareUnsigned(
data, 0, length, reinterpret_cast<const uint8_t*>(bound.data()), 0, length);
};
int32_t cmpMax = cmp(query_max);
int32_t cmpMin = cmp(query_min);
if (cmpMax > 0 || cmpMin < 0) {
return bkd::relation::CELL_OUTSIDE_QUERY;
}
if (cmpMin > 0 && cmpMax < 0) {
return bkd::relation::CELL_INSIDE_QUERY;
}
return bkd::relation::CELL_CROSSES_QUERY;
}
template <InvertedIndexQueryType QT>
lucene::util::bkd::relation InvertedIndexVisitor<QT>::compare(std::vector<uint8_t>& min_packed,
std::vector<uint8_t>& max_packed) {
if (UNLIKELY(_reader == nullptr)) {
throw CLuceneError(CL_ERR_NullPointer, "bkd index reader is null", false);
}
bool crosses = false;
for (int dim = 0; dim < _reader->num_data_dims_; dim++) {
int offset = dim * _reader->bytes_per_dim_;
if (lucene::util::FutureArrays::CompareUnsigned(min_packed.data(), offset,
offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset,
offset + _reader->bytes_per_dim_) > 0 ||
lucene::util::FutureArrays::CompareUnsigned(max_packed.data(), offset,
offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset,
offset + _reader->bytes_per_dim_) < 0) {
return lucene::util::bkd::relation::CELL_OUTSIDE_QUERY;
}
crosses |= lucene::util::FutureArrays::CompareUnsigned(
min_packed.data(), offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_min.c_str(), offset,
offset + _reader->bytes_per_dim_) < 0 ||
lucene::util::FutureArrays::CompareUnsigned(
max_packed.data(), offset, offset + _reader->bytes_per_dim_,
(const uint8_t*)query_max.c_str(), offset,
offset + _reader->bytes_per_dim_) > 0;
}
if (crosses) {
return lucene::util::bkd::relation::CELL_CROSSES_QUERY;
} else {
return lucene::util::bkd::relation::CELL_INSIDE_QUERY;
}
}
template class InvertedIndexVisitor<InvertedIndexQueryType::LESS_THAN_QUERY>;
template class InvertedIndexVisitor<InvertedIndexQueryType::EQUAL_QUERY>;
template class InvertedIndexVisitor<InvertedIndexQueryType::LESS_EQUAL_QUERY>;
template class InvertedIndexVisitor<InvertedIndexQueryType::GREATER_THAN_QUERY>;
template class InvertedIndexVisitor<InvertedIndexQueryType::GREATER_EQUAL_QUERY>;
} // namespace doris::segment_v2
#include "common/compile_check_end.h"