blob: d227da3fde38cee630ddd0103fcc813f7f0dfb8e [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/primary_key_index.h"
#include <butil/time.h>
#include <gen_cpp/segment_v2.pb.h>
#include <utility>
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/config.h"
#include "io/fs/file_writer.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/bloom_filter_index_reader.h"
#include "olap/rowset/segment_v2/bloom_filter_index_writer.h"
#include "olap/rowset/segment_v2/encoding_info.h"
#include "olap/types.h"
namespace doris {
static bvar::Adder<size_t> g_primary_key_index_memory_bytes("doris_primary_key_index_memory_bytes");
Status PrimaryKeyIndexBuilder::init() {
// TODO(liaoxin) using the column type directly if there's only one column in unique key columns
const auto* type_info = get_scalar_type_info<FieldType::OLAP_FIELD_TYPE_VARCHAR>();
segment_v2::IndexedColumnWriterOptions options;
options.write_ordinal_index = true;
options.write_value_index = true;
options.data_page_size = config::primary_key_data_page_size;
options.encoding = segment_v2::EncodingInfo::get_default_encoding(type_info->type(), true);
options.compression = segment_v2::ZSTD;
_primary_key_index_builder.reset(
new segment_v2::IndexedColumnWriter(options, type_info, _file_writer));
RETURN_IF_ERROR(_primary_key_index_builder->init());
auto opt = segment_v2::BloomFilterOptions();
opt.fpp = 0.01;
RETURN_IF_ERROR(segment_v2::PrimaryKeyBloomFilterIndexWriterImpl::create(
opt, type_info, &_bloom_filter_index_builder));
return Status::OK();
}
Status PrimaryKeyIndexBuilder::add_item(const Slice& key) {
RETURN_IF_ERROR(_primary_key_index_builder->add(&key));
Slice key_without_seq = Slice(key.get_data(), key.get_size() - _seq_col_length - _rowid_length);
RETURN_IF_ERROR(_bloom_filter_index_builder->add_values(&key_without_seq, 1));
// the key is already sorted, so the first key is min_key, and
// the last key is max_key.
if (UNLIKELY(_num_rows == 0)) {
_min_key.append(key.get_data(), key.get_size());
}
DCHECK(key.compare(_max_key) > 0)
<< "found duplicate key or key is not sorted! current key: " << key
<< ", last max key: " << _max_key;
_max_key.clear();
_max_key.append(key.get_data(), key.get_size());
_num_rows++;
_size += key.get_size();
return Status::OK();
}
Status PrimaryKeyIndexBuilder::finalize(segment_v2::PrimaryKeyIndexMetaPB* meta) {
// finish primary key index
RETURN_IF_ERROR(_primary_key_index_builder->finish(meta->mutable_primary_key_index()));
_disk_size += _primary_key_index_builder->disk_size();
// set min_max key, the sequence column should be removed
meta->set_min_key(min_key().to_string());
meta->set_max_key(max_key().to_string());
// finish bloom filter index
RETURN_IF_ERROR(_bloom_filter_index_builder->flush());
uint64_t start_size = _file_writer->bytes_appended();
RETURN_IF_ERROR(
_bloom_filter_index_builder->finish(_file_writer, meta->mutable_bloom_filter_index()));
_disk_size += _file_writer->bytes_appended() - start_size;
_primary_key_index_builder.reset(nullptr);
_bloom_filter_index_builder.reset(nullptr);
return Status::OK();
}
Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader,
const segment_v2::PrimaryKeyIndexMetaPB& meta,
OlapReaderStatistics* pk_index_load_stats) {
// parse primary key index
_index_reader.reset(new segment_v2::IndexedColumnReader(file_reader, meta.primary_key_index()));
_index_reader->set_is_pk_index(true);
RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false,
pk_index_load_stats));
_index_parsed = true;
return Status::OK();
}
Status PrimaryKeyIndexReader::parse_bf(io::FileReaderSPtr file_reader,
const segment_v2::PrimaryKeyIndexMetaPB& meta,
OlapReaderStatistics* pk_index_load_stats) {
// parse bloom filter
segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index();
segment_v2::BloomFilterIndexReader bf_index_reader(std::move(file_reader),
column_index_meta.bloom_filter_index());
RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false,
pk_index_load_stats));
std::unique_ptr<segment_v2::BloomFilterIndexIterator> bf_iter;
RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter, pk_index_load_stats));
RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf));
segment_v2::g_pk_total_bloom_filter_num << 1;
segment_v2::g_pk_total_bloom_filter_total_bytes << _bf->size();
segment_v2::g_pk_read_bloom_filter_num << 1;
segment_v2::g_pk_read_bloom_filter_total_bytes << _bf->size();
_bf_num += 1;
_bf_bytes += _bf->size();
_bf_parsed = true;
return Status::OK();
}
} // namespace doris