// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "olap/task/engine_clone_task.h"
#include <absl/strings/str_split.h>
#include <curl/curl.h>
#include <fcntl.h>
#include <fmt/format.h>
#include <gen_cpp/AgentService_types.h>
#include <gen_cpp/BackendService.h>
#include <gen_cpp/HeartbeatService_types.h>
#include <gen_cpp/MasterService_types.h>
#include <gen_cpp/Status_types.h>
#include <gen_cpp/Types_constants.h>
#include <sys/stat.h>
#include <filesystem>
#include <memory>
#include <mutex>
#include <ostream>
#include <set>
#include <shared_mutex>
#include <system_error>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "common/config.h"
#include "common/logging.h"
#include "http/http_client.h"
#include "http/utils.h"
#include "io/fs/file_system.h"
#include "io/fs/local_file_system.h"
#include "io/fs/path.h"
#include "olap/data_dir.h"
#include "olap/olap_common.h"
#include "olap/olap_define.h"
#include "olap/pb_helper.h"
#include "olap/rowset/rowset.h"
#include "olap/snapshot_manager.h"
#include "olap/storage_engine.h"
#include "olap/tablet.h"
#include "olap/tablet_manager.h"
#include "olap/tablet_meta.h"
#include "runtime/client_cache.h"
#include "runtime/memory/mem_tracker_limiter.h"
#include "runtime/thread_context.h"
#include "util/debug_points.h"
#include "util/defer_op.h"
#include "util/network_util.h"
#include "util/security.h"
#include "util/stopwatch.hpp"
#include "util/thrift_rpc_helper.h"
#include "util/trace.h"
using std::set;
using std::stringstream;
namespace doris {
using namespace ErrorCode;
namespace {
/// If the destination binlog file already exists, check whether its md5sum matches the clone file's:
/// if they match, skip linking the file; if they differ, return an error.
/// Return value: the destination binlog file path (returned directly if the file does not exist yet).
Result<std::string> check_dest_binlog_valid(const std::string& tablet_dir,
const std::string& clone_dir,
const std::string& clone_file, bool* skip_link_file) {
std::string from, to;
std::string new_clone_file = clone_file;
if (clone_file.ends_with(".binlog")) {
// change clone_file suffix from .binlog to .dat
new_clone_file.replace(clone_file.size() - 7, 7, ".dat");
} else if (clone_file.ends_with(".binlog-index")) {
// change clone_file suffix from .binlog-index to .idx
new_clone_file.replace(clone_file.size() - 13, 13, ".idx");
}
from = fmt::format("{}/{}", clone_dir, clone_file);
to = fmt::format("{}/_binlog/{}", tablet_dir, new_clone_file);
// check whether the destination file exists
bool exists = true;
auto status = io::global_local_filesystem()->exists(to, &exists);
if (!status.ok()) {
return ResultError(std::move(status));
}
if (!exists) {
return to;
}
LOG(WARNING) << "binlog file already exist. "
<< "tablet_dir=" << tablet_dir << ", clone_file=" << from << ", to=" << to;
std::string clone_file_md5sum;
status = io::global_local_filesystem()->md5sum(from, &clone_file_md5sum);
if (!status.ok()) {
return ResultError(std::move(status));
}
std::string to_file_md5sum;
status = io::global_local_filesystem()->md5sum(to, &to_file_md5sum);
if (!status.ok()) {
return ResultError(std::move(status));
}
if (clone_file_md5sum == to_file_md5sum) {
// md5sums match, so skip linking the file
*skip_link_file = true;
return to;
} else {
auto err_msg = fmt::format(
"binlog file already exist, but md5sum not equal. "
"tablet_dir={}, clone_file={}",
tablet_dir, clone_file);
LOG(WARNING) << err_msg;
return ResultError(Status::InternalError(std::move(err_msg)));
}
}
} // namespace
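// Like RETURN_IF_ERROR, but also stores the result into `status` before returning,
// so that Defer cleanup lambdas capturing `status` can observe the failure.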
#define RETURN_IF_ERROR_(status, stmt) \
do { \
status = (stmt); \
if (UNLIKELY(!status.ok())) { \
return status; \
} \
} while (false)
EngineCloneTask::EngineCloneTask(StorageEngine& engine, const TCloneReq& clone_req,
const ClusterInfo* cluster_info, int64_t signature,
std::vector<TTabletInfo>* tablet_infos)
: _engine(engine),
_clone_req(clone_req),
_tablet_infos(tablet_infos),
_signature(signature),
_cluster_info(cluster_info) {
_mem_tracker = MemTrackerLimiter::create_shared(
MemTrackerLimiter::Type::OTHER,
"EngineCloneTask#tabletId=" + std::to_string(_clone_req.tablet_id));
}
Status EngineCloneTask::execute() {
// register the tablet so that it cannot be deleted by the gc thread during the clone process
Status st = _do_clone();
_engine.tablet_manager()->update_partitions_visible_version(
{{_clone_req.partition_id, _clone_req.version}});
return st;
}
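// Clone workflow:
// 1. Register the tablet as a transition tablet so it is not removed while the clone is in progress.
// 2. If the tablet already exists locally, compute its missed versions and try an incremental clone;
//    otherwise obtain a shard path on a local disk and clone a whole new tablet.
// 3. Make and download snapshots from one of the source backends (see _make_and_download_snapshots).
// 4. Finish the clone by revising the tablet meta (or loading the new tablet), then report the tablet info.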
Status EngineCloneTask::_do_clone() {
DBUG_EXECUTE_IF("EngineCloneTask.wait_clone", {
auto duration = std::chrono::milliseconds(dp->param("duration", 10 * 1000));
std::this_thread::sleep_for(duration);
});
DBUG_EXECUTE_IF("EngineCloneTask.failed_clone", {
LOG_WARNING("EngineCloneTask.failed_clone")
.tag("tablet_id", _clone_req.tablet_id)
.tag("replica_id", _clone_req.replica_id)
.tag("version", _clone_req.version);
return Status::InternalError(
"in debug point, EngineCloneTask.failed_clone tablet={}, replica={}, version={}",
_clone_req.tablet_id, _clone_req.replica_id, _clone_req.version);
});
Status status = Status::OK();
std::string src_file_path;
TBackend src_host;
RETURN_IF_ERROR(
_engine.tablet_manager()->register_transition_tablet(_clone_req.tablet_id, "clone"));
Defer defer {[&]() {
_engine.tablet_manager()->unregister_transition_tablet(_clone_req.tablet_id, "clone");
}};
// Check whether the local tablet exists
TabletSharedPtr tablet = _engine.tablet_manager()->get_tablet(_clone_req.tablet_id);
// If the tablet state is not ready, it is a residual tablet left over from a failed schema
// change. Clone a new tablet from a remote BE to overwrite it. This situation basically only
// occurs when the be_rebalancer_fuzzy_test configuration is enabled.
if (tablet && tablet->tablet_state() == TABLET_NOTREADY) {
LOG(WARNING) << "tablet state is not ready when clone, need to drop old tablet, tablet_id="
<< tablet->tablet_id();
RETURN_IF_ERROR(_engine.tablet_manager()->drop_tablet(tablet->tablet_id(),
tablet->replica_id(), false));
tablet.reset();
}
_is_new_tablet = tablet == nullptr;
// try an incremental clone
Versions missed_versions;
// try to repair a tablet with missing versions
if (tablet != nullptr) {
std::shared_lock migration_rlock(tablet->get_migration_lock(), std::try_to_lock);
if (!migration_rlock.owns_lock()) {
return Status::Error<TRY_LOCK_FAILED>(
"EngineCloneTask::_do_clone meet try lock failed");
}
if (tablet->replica_id() < _clone_req.replica_id) {
// `tablet` may be a replica that has already been dropped in FE, e.g.:
// BE1 migrates a replica of tablet_1 to BE2, but before BE1 drops this replica, another new replica of tablet_1 is migrated back to BE1.
// Clone can still continue in this case, but to keep `replica_id` consistent with FE, we MUST reset `replica_id` to the request's `replica_id`.
tablet->tablet_meta()->set_replica_id(_clone_req.replica_id);
}
// get download path
auto local_data_path = fmt::format("{}/{}", tablet->tablet_path(), CLONE_PREFIX);
bool allow_incremental_clone = false;
int64_t specified_version = _clone_req.version;
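// For unique-key merge-on-write tablets, cap the cloned version at (min pending publish version - 1)
// so the clone does not cover versions that are still pending publish on this BE.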
if (tablet->enable_unique_key_merge_on_write()) {
int64_t min_pending_ver = _engine.get_pending_publish_min_version(tablet->tablet_id());
if (min_pending_ver - 1 < specified_version) {
LOG(INFO) << "use min pending publish version for clone, min_pending_ver: "
<< min_pending_ver << " visible_version: " << _clone_req.version;
specified_version = min_pending_ver - 1;
}
}
missed_versions = tablet->get_missed_versions(specified_version);
// If no versions are missed, it is useless to clone from the remote BE because the local data is
// already complete. Otherwise the remote BE would return only the header without the rowset files, and the clone would fail.
if (missed_versions.empty()) {
LOG(INFO) << "missed version size = 0, skip clone and return success. tablet_id="
<< _clone_req.tablet_id << " replica_id=" << _clone_req.replica_id;
RETURN_IF_ERROR(_set_tablet_info());
return Status::OK();
}
LOG(INFO) << "clone to existed tablet. missed_versions_size=" << missed_versions.size()
<< ", allow_incremental_clone=" << allow_incremental_clone
<< ", signature=" << _signature << ", tablet_id=" << _clone_req.tablet_id
<< ", visible_version=" << _clone_req.version
<< ", replica_id=" << _clone_req.replica_id;
// try to download the missing versions from the src backend.
// if the tablet on the src backend does not contain the missing versions, it will download all versions
// and set allow_incremental_clone to false
RETURN_IF_ERROR(_make_and_download_snapshots(*(tablet->data_dir()), local_data_path,
&src_host, &src_file_path, missed_versions,
&allow_incremental_clone));
RETURN_IF_ERROR(_finish_clone(tablet.get(), local_data_path, specified_version,
allow_incremental_clone));
} else {
LOG(INFO) << "clone tablet not exist, begin clone a new tablet from remote be. "
<< "signature=" << _signature << ", tablet_id=" << _clone_req.tablet_id
<< ", visible_version=" << _clone_req.version
<< ", req replica=" << _clone_req.replica_id;
// create a new tablet on this BE
// Get a local disk (shard path) from the storage engine
std::string local_shard_root_path;
DataDir* store = nullptr;
RETURN_IF_ERROR(_engine.obtain_shard_path(_clone_req.storage_medium,
_clone_req.dest_path_hash, &local_shard_root_path,
&store, _clone_req.partition_id));
auto tablet_dir = fmt::format("{}/{}/{}", local_shard_root_path, _clone_req.tablet_id,
_clone_req.schema_hash);
Defer remove_useless_dir {[&] {
if (status.ok()) {
return;
}
LOG(INFO) << "clone failed. want to delete local dir: " << tablet_dir
<< ". signature: " << _signature;
WARN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_dir),
"failed to delete useless clone dir ");
WARN_IF_ERROR(DataDir::delete_tablet_parent_path_if_empty(tablet_dir),
"failed to delete parent dir");
}};
bool exists = true;
Status exists_st = io::global_local_filesystem()->exists(tablet_dir, &exists);
if (!exists_st) {
LOG(WARNING) << "cant get path=" << tablet_dir << " state, st=" << exists_st;
return exists_st;
}
if (exists) {
LOG(WARNING) << "before clone dest path=" << tablet_dir << " exist, remove it first";
RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(tablet_dir));
}
bool allow_incremental_clone = false;
RETURN_IF_ERROR_(status,
_make_and_download_snapshots(*store, tablet_dir, &src_host, &src_file_path,
missed_versions, &allow_incremental_clone));
LOG(INFO) << "clone copy done. src_host: " << src_host.host
<< " src_file_path: " << src_file_path;
auto tablet_manager = _engine.tablet_manager();
RETURN_IF_ERROR_(status, tablet_manager->load_tablet_from_dir(store, _clone_req.tablet_id,
_clone_req.schema_hash,
tablet_dir, false));
auto nested_tablet = tablet_manager->get_tablet(_clone_req.tablet_id);
if (!nested_tablet) {
status = Status::NotFound("tablet not found, tablet_id={}", _clone_req.tablet_id);
return status;
}
// MUST reset `replica_id` to request `replica_id` to keep consistent with FE
nested_tablet->tablet_meta()->set_replica_id(_clone_req.replica_id);
// clone succeeded; delete the .hdr file because the tablet meta is stored in rocksdb
std::string header_path =
TabletMeta::construct_header_file_path(tablet_dir, _clone_req.tablet_id);
RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path));
}
return _set_tablet_info();
}
Status EngineCloneTask::_set_tablet_info() {
// Get clone tablet info
TTabletInfo tablet_info;
tablet_info.__set_tablet_id(_clone_req.tablet_id);
tablet_info.__set_replica_id(_clone_req.replica_id);
tablet_info.__set_schema_hash(_clone_req.schema_hash);
RETURN_IF_ERROR(_engine.tablet_manager()->report_tablet_info(&tablet_info));
if (_clone_req.__isset.version && tablet_info.version < _clone_req.version) {
// if it is a new tablet and the clone failed, remove the tablet;
// if it is an incremental clone, the tablet must not be dropped
if (_is_new_tablet) {
// we need to check whether the cloned tablet's version is what we expect.
// if not, this may be a stale leftover tablet that is waiting to be dropped,
// so we drop it.
LOG(WARNING) << "begin to drop the stale tablet. tablet_id:" << _clone_req.tablet_id
<< ", replica_id:" << _clone_req.replica_id
<< ", schema_hash:" << _clone_req.schema_hash
<< ", signature:" << _signature << ", version:" << tablet_info.version
<< ", expected_version: " << _clone_req.version;
WARN_IF_ERROR(_engine.tablet_manager()->drop_tablet(_clone_req.tablet_id,
_clone_req.replica_id, false),
"drop stale cloned table failed");
}
return Status::InternalError("unexpected version. tablet version: {}, expected version: {}",
tablet_info.version, _clone_req.version);
}
LOG(INFO) << "clone get tablet info success. tablet_id:" << _clone_req.tablet_id
<< ", schema_hash:" << _clone_req.schema_hash << ", signature:" << _signature
<< ", replica id:" << _clone_req.replica_id << ", version:" << tablet_info.version;
_tablet_infos->push_back(tablet_info);
return Status::OK();
}
/// This method will do following things:
/// 1. Make snapshots on source BE.
/// 2. Download all snapshots to CLONE dir.
/// 3. Convert rowset ids of downloaded snapshots (this also changes the replica id).
/// 4. Release the snapshots on source BE.
Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir,
const std::string& local_data_path,
TBackend* src_host, std::string* snapshot_path,
const std::vector<Version>& missed_versions,
bool* allow_incremental_clone) {
Status status;
const auto& token = _cluster_info->token;
int timeout_s = 0;
if (_clone_req.__isset.timeout_s) {
timeout_s = _clone_req.timeout_s;
}
for (auto&& src : _clone_req.src_backends) {
// Make snapshot in remote olap engine
*src_host = src;
// make snapshot
status = _make_snapshot(src.host, src.be_port, _clone_req.tablet_id, _clone_req.schema_hash,
timeout_s, missed_versions, snapshot_path, allow_incremental_clone);
if (!status.ok()) [[unlikely]] {
LOG_WARNING("failed to make snapshot in remote BE")
.tag("host", src.host)
.tag("port", src.be_port)
.tag("tablet", _clone_req.tablet_id)
.tag("signature", _signature)
.tag("missed_versions", missed_versions)
.error(status);
continue; // Try another BE
}
LOG_INFO("successfully make snapshot in remote BE")
.tag("host", src.host)
.tag("port", src.be_port)
.tag("tablet", _clone_req.tablet_id)
.tag("snapshot_path", *snapshot_path)
.tag("signature", _signature)
.tag("missed_versions", missed_versions);
Defer defer {[host = src.host, port = src.be_port, &snapshot_path = *snapshot_path, this] {
// TODO(plat1ko): Async release snapshot
auto st = _release_snapshot(host, port, snapshot_path);
if (!st.ok()) [[unlikely]] {
LOG_WARNING("failed to release snapshot in remote BE")
.tag("host", host)
.tag("port", port)
.tag("snapshot_path", snapshot_path)
.error(st);
}
}};
std::string remote_dir;
{
std::stringstream ss;
if (snapshot_path->back() == '/') {
ss << *snapshot_path << _clone_req.tablet_id << "/" << _clone_req.schema_hash
<< "/";
} else {
ss << *snapshot_path << "/" << _clone_req.tablet_id << "/" << _clone_req.schema_hash
<< "/";
}
remote_dir = ss.str();
}
std::string address = get_host_port(src.host, src.http_port);
if (config::enable_batch_download && is_support_batch_download(address).ok()) {
// download files via batch api.
LOG_INFO("remote BE supports batch download, use batch file download")
.tag("address", address)
.tag("remote_dir", remote_dir);
status = _batch_download_files(&data_dir, address, remote_dir, local_data_path);
if (!status.ok()) [[unlikely]] {
LOG_WARNING("failed to download snapshot from remote BE in batch")
.tag("address", address)
.tag("remote_dir", remote_dir)
.error(status);
continue; // Try another BE
}
} else {
if (config::enable_batch_download) {
LOG_INFO("remote BE does not support batch download, use single file download")
.tag("address", address)
.tag("remote_dir", remote_dir);
} else {
LOG_INFO("batch download is disabled, use single file download")
.tag("address", address)
.tag("remote_dir", remote_dir);
}
std::string remote_url_prefix;
{
std::stringstream ss;
ss << "http://" << address << HTTP_REQUEST_PREFIX << HTTP_REQUEST_TOKEN_PARAM
<< token << HTTP_REQUEST_FILE_PARAM << remote_dir;
remote_url_prefix = ss.str();
}
status = _download_files(&data_dir, remote_url_prefix, local_data_path);
if (!status.ok()) [[unlikely]] {
LOG_WARNING("failed to download snapshot from remote BE")
.tag("url", mask_token(remote_url_prefix))
.error(status);
continue; // Try another BE
}
}
// No need to try again with another BE
_pending_rs_guards = DORIS_TRY(_engine.snapshot_mgr()->convert_rowset_ids(
local_data_path, _clone_req.tablet_id, _clone_req.replica_id, _clone_req.table_id,
_clone_req.partition_id, _clone_req.schema_hash));
break;
} // clone copy from one backend
return status;
}
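/// Ask the remote BE at ip:port to make a snapshot of the tablet.
/// On success, *snapshot_path holds the snapshot directory on the remote BE (with a trailing '/'),
/// and *allow_incremental_clone tells whether the remote BE honored the missing-version list.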
Status EngineCloneTask::_make_snapshot(const std::string& ip, int port, TTableId tablet_id,
TSchemaHash schema_hash, int timeout_s,
const std::vector<Version>& missed_versions,
std::string* snapshot_path, bool* allow_incremental_clone) {
TSnapshotRequest request;
request.__set_tablet_id(tablet_id);
request.__set_schema_hash(schema_hash);
request.__set_preferred_snapshot_version(g_Types_constants.TPREFER_SNAPSHOT_REQ_VERSION);
request.__set_version(_clone_req.version);
request.__set_is_copy_binlog(true);
// TODO: this assumes each missing version is composed of a singleton delta;
// if not, this place should be rewritten.
// We set __isset.missing_version = true on every TSnapshotRequest sent from a BE,
// so if a BE receives a request with __isset.missing_version = false, it means
// the request was sent from the FE (the FE never sets this field).
request.__isset.missing_version = true;
for (auto& version : missed_versions) {
request.missing_version.push_back(version.first);
}
if (timeout_s > 0) {
request.__set_timeout(timeout_s);
}
TAgentResult result;
RETURN_IF_ERROR(ThriftRpcHelper::rpc<BackendServiceClient>(
ip, port, [&request, &result](BackendServiceConnection& client) {
client->make_snapshot(result, request);
}));
if (result.status.status_code != TStatusCode::OK) {
return Status::create(result.status);
}
if (!result.__isset.snapshot_path) {
return Status::InternalError("success snapshot request without snapshot path");
}
*snapshot_path = result.snapshot_path;
if (snapshot_path->at(snapshot_path->length() - 1) != '/') {
snapshot_path->append("/");
}
if (result.__isset.allow_incremental_clone) {
// During an upgrade, some BE nodes may still be running an old version
// in which incremental clone is not supported,
// so a flag is needed to indicate whether it is available.
*allow_incremental_clone = result.allow_incremental_clone;
}
return Status::OK();
}
Status EngineCloneTask::_release_snapshot(const std::string& ip, int port,
const std::string& snapshot_path) {
TAgentResult result;
RETURN_IF_ERROR(ThriftRpcHelper::rpc<BackendServiceClient>(
ip, port, [&snapshot_path, &result](BackendServiceConnection& client) {
client->release_snapshot(result, snapshot_path);
}));
return Status::create(result.status);
}
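/// Download every file under `remote_url_prefix` into `local_path`:
/// list the remote files, move the .hdr file to the end of the list, then download each file
/// one by one with retries, verifying its size and fixing its permissions afterwards.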
Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& remote_url_prefix,
const std::string& local_path) {
// Check whether the local path exists; if it does, remove it, then create the dir.
// local_file_full_path = tabletid/clone; for a specific tablet there should be only one such folder,
// so if the folder already exists it must be removed.
// For example, this BE clones from BE 1 to download file 1 with version (2,2), but the clone from BE 1 fails.
// It then tries to clone from BE 2 and finds that file 1 already exists, yet a file with the same
// name may contain a different version.
RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_path));
RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_path));
// Get remote dir file list
std::string file_list_str;
auto list_files_cb = [&remote_url_prefix, &file_list_str](HttpClient* client) {
RETURN_IF_ERROR(client->init(remote_url_prefix));
client->set_timeout_ms(LIST_REMOTE_FILE_TIMEOUT * 1000);
return client->execute(&file_list_str);
};
RETURN_IF_ERROR(HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, list_files_cb));
std::vector<std::string> file_name_list =
absl::StrSplit(file_list_str, "\n", absl::SkipWhitespace());
// If the header file does not exist, the tablet cannot be loaded by the olap engine.
// To avoid ending up with incomplete data, we copy the header file last.
// The header file's name ends with .hdr.
for (int i = 0; i + 1 < file_name_list.size(); ++i) {
if (file_name_list[i].ends_with(".hdr")) {
std::swap(file_name_list[i], file_name_list[file_name_list.size() - 1]);
break;
}
}
// Get copy from remote
uint64_t total_file_size = 0;
MonotonicStopWatch watch;
watch.start();
for (auto& file_name : file_name_list) {
auto remote_file_url = remote_url_prefix + file_name;
// get file length
uint64_t file_size = 0;
auto get_file_size_cb = [&remote_file_url, &file_size](HttpClient* client) {
RETURN_IF_ERROR(client->init(remote_file_url));
client->set_timeout_ms(GET_LENGTH_TIMEOUT * 1000);
RETURN_IF_ERROR(client->head());
RETURN_IF_ERROR(client->get_content_length(&file_size));
return Status::OK();
};
RETURN_IF_ERROR(
HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, get_file_size_cb));
// check disk capacity
if (data_dir->reach_capacity_limit(file_size)) {
return Status::Error<EXCEEDED_LIMIT>(
"reach the capacity limit of path {}, file_size={}", data_dir->path(),
file_size);
}
total_file_size += file_size;
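// Estimate the download timeout from the file size, assuming the transfer runs at least at
// config::download_low_speed_limit_kbps, and never use a timeout below config::download_low_speed_time.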
uint64_t estimate_timeout = file_size / config::download_low_speed_limit_kbps / 1024;
if (estimate_timeout < config::download_low_speed_time) {
estimate_timeout = config::download_low_speed_time;
}
std::string local_file_path = local_path + "/" + file_name;
LOG(INFO) << "clone begin to download file from: " << mask_token(remote_file_url)
<< " to: " << local_file_path << ". size(B): " << file_size
<< ", timeout(s): " << estimate_timeout;
auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path,
file_size](HttpClient* client) {
RETURN_IF_ERROR(client->init(remote_file_url));
client->set_timeout_ms(estimate_timeout * 1000);
RETURN_IF_ERROR(client->download(local_file_path));
std::error_code ec;
// Check file length
uint64_t local_file_size = std::filesystem::file_size(local_file_path, ec);
if (ec) {
LOG(WARNING) << "download file error" << ec.message();
return Status::IOError("can't retrive file_size of {}, due to {}", local_file_path,
ec.message());
}
if (local_file_size != file_size) {
LOG(WARNING) << "download file length error"
<< ", remote_path=" << mask_token(remote_file_url)
<< ", file_size=" << file_size
<< ", local_file_size=" << local_file_size;
return Status::InternalError("downloaded file size is not equal");
}
return io::global_local_filesystem()->permission(local_file_path,
io::LocalFileSystem::PERMS_OWNER_RW);
};
RETURN_IF_ERROR(HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, download_cb));
} // Clone files from remote backend
uint64_t total_time_ms = watch.elapsed_time() / 1000 / 1000;
total_time_ms = total_time_ms > 0 ? total_time_ms : 0;
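// bytes / ms / 1000 == MB/s, matching the "MB/s" unit in the log below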
double copy_rate = 0.0;
if (total_time_ms > 0) {
copy_rate = total_file_size / ((double)total_time_ms) / 1000;
}
_copy_size = (int64_t)total_file_size;
_copy_time_ms = (int64_t)total_time_ms;
LOG(INFO) << "succeed to copy tablet " << _signature
<< ", total files: " << file_name_list.size()
<< ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms"
<< ", rate: " << copy_rate << " MB/s";
return Status::OK();
}
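/// Like _download_files, but uses the batch download API: list the remote files (with sizes) in one
/// request, then download them in batches to reduce the number of HTTP round trips.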
Status EngineCloneTask::_batch_download_files(DataDir* data_dir, const std::string& address,
const std::string& remote_dir,
const std::string& local_dir) {
constexpr size_t BATCH_FILE_SIZE = 64 << 20; // 64MB
constexpr size_t BATCH_FILE_NUM = 64;
// Check local path exist, if exist, remove it, then create the dir
// local_file_full_path = tabletid/clone, for a specific tablet, there should be only one folder
// if this folder exists, then should remove it
// for example, BE clone from BE 1 to download file 1 with version (2,2), but clone from BE 1 failed
// then it will try to clone from BE 2, but it will find the file 1 already exist, but file 1 with same
// name may have different versions.
RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_dir));
RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_dir));
const std::string& token = _cluster_info->token;
std::vector<std::pair<std::string, size_t>> file_info_list;
RETURN_IF_ERROR(list_remote_files_v2(address, token, remote_dir, &file_info_list));
// If the header file does not exist, the tablet cannot be loaded by the olap engine.
// To avoid ending up with incomplete data, we copy the header file last.
// The header file's name ends with .hdr.
for (int i = 0; i + 1 < file_info_list.size(); ++i) {
if (file_info_list[i].first.ends_with(".hdr")) {
std::swap(file_info_list[i], file_info_list[file_info_list.size() - 1]);
break;
}
}
MonotonicStopWatch watch;
watch.start();
size_t total_file_size = 0;
size_t total_files = file_info_list.size();
std::vector<std::pair<std::string, size_t>> batch_files;
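// Group files into batches bounded by BATCH_FILE_NUM files and BATCH_FILE_SIZE bytes,
// keeping the trailing .hdr file in a batch of its own so the tablet meta is downloaded last.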
for (size_t i = 0; i < total_files;) {
size_t batch_file_size = 0;
for (size_t j = i; j < total_files; j++) {
// Split batches by file number and file size,
if (BATCH_FILE_NUM <= batch_files.size() || BATCH_FILE_SIZE <= batch_file_size ||
// ... or separate the last .hdr file into a single batch.
(j + 1 == total_files && !batch_files.empty())) {
break;
}
batch_files.push_back(file_info_list[j]);
batch_file_size += file_info_list[j].second;
}
// check disk capacity
if (data_dir->reach_capacity_limit(batch_file_size)) {
return Status::Error<EXCEEDED_LIMIT>(
"reach the capacity limit of path {}, file_size={}", data_dir->path(),
batch_file_size);
}
RETURN_IF_ERROR(download_files_v2(address, token, remote_dir, local_dir, batch_files));
total_file_size += batch_file_size;
i += batch_files.size();
batch_files.clear();
}
uint64_t total_time_ms = watch.elapsed_time() / 1000 / 1000;
total_time_ms = total_time_ms > 0 ? total_time_ms : 0;
double copy_rate = 0.0;
if (total_time_ms > 0) {
copy_rate = total_file_size / ((double)total_time_ms) / 1000;
}
_copy_size = (int64_t)total_file_size;
_copy_time_ms = (int64_t)total_time_ms;
LOG(INFO) << "succeed to copy tablet " << _signature
<< ", total files: " << file_info_list.size()
<< ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms"
<< ", rate: " << copy_rate << " MB/s";
return Status::OK();
}
/// This method is only called if the tablet already exists on this BE when doing the clone.
/// This method will do the following things:
/// 1. Link all files from CLONE dir to tablet dir if file does not exist in tablet dir
/// 2. Call _finish_xx_clone() to revise the tablet meta.
Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_dir, int64_t version,
bool is_incremental_clone) {
Defer remove_clone_dir {[&]() {
std::error_code ec;
std::filesystem::remove_all(clone_dir, ec);
if (ec) {
LOG(WARNING) << "failed to remove=" << clone_dir << " msg=" << ec.message();
}
}};
// check clone dir existed
bool exists = true;
RETURN_IF_ERROR(io::global_local_filesystem()->exists(clone_dir, &exists));
if (!exists) {
return Status::InternalError("clone dir not existed. clone_dir={}", clone_dir);
}
// Load src header.
// The tablet meta info is downloaded from source BE as .hdr file.
// So we load it and generate cloned_tablet_meta.
auto cloned_tablet_meta_file = fmt::format("{}/{}.hdr", clone_dir, tablet->tablet_id());
auto cloned_tablet_meta = std::make_shared<TabletMeta>();
RETURN_IF_ERROR(cloned_tablet_meta->create_from_file(cloned_tablet_meta_file));
// remove the cloned meta file
RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(cloned_tablet_meta_file));
// remove rowset binlog metas
const auto& tablet_dir = tablet->tablet_path();
auto binlog_metas_file = fmt::format("{}/rowset_binlog_metas.pb", clone_dir);
bool binlog_metas_file_exists = false;
auto file_exists_status =
io::global_local_filesystem()->exists(binlog_metas_file, &binlog_metas_file_exists);
if (!file_exists_status.ok()) {
return file_exists_status;
}
bool contain_binlog = false;
RowsetBinlogMetasPB rowset_binlog_metas_pb;
if (binlog_metas_file_exists) {
std::error_code ec;
auto binlog_meta_filesize = std::filesystem::file_size(binlog_metas_file, ec);
if (ec) {
LOG(WARNING) << "get file size error" << ec.message();
return Status::IOError("can't retrive file_size of {}, due to {}", binlog_metas_file,
ec.message());
}
if (binlog_meta_filesize > 0) {
contain_binlog = true;
RETURN_IF_ERROR(read_pb(binlog_metas_file, &rowset_binlog_metas_pb));
}
RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(binlog_metas_file));
}
if (contain_binlog) {
auto binlog_dir = fmt::format("{}/_binlog", tablet_dir);
RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(binlog_dir));
}
// check all files in /clone and /tablet
std::vector<io::FileInfo> clone_files;
RETURN_IF_ERROR(io::global_local_filesystem()->list(clone_dir, true, &clone_files, &exists));
std::unordered_set<std::string> clone_file_names;
for (auto& file : clone_files) {
clone_file_names.insert(file.file_name);
}
std::vector<io::FileInfo> local_files;
RETURN_IF_ERROR(io::global_local_filesystem()->list(tablet_dir, true, &local_files, &exists));
std::unordered_set<std::string> local_file_names;
for (auto& file : local_files) {
local_file_names.insert(file.file_name);
}
Status status;
std::vector<std::string> linked_success_files;
Defer remove_linked_files {[&]() { // clear linked files if errors happen
if (!status.ok()) {
std::vector<io::Path> paths;
for (auto& file : linked_success_files) {
paths.emplace_back(file);
}
static_cast<void>(io::global_local_filesystem()->batch_delete(paths));
}
}};
/// Traverse all downloaded clone files in the CLONE dir.
/// If a file does not exist in the local tablet dir, link it into the local tablet dir
/// and record all linked files in linked_success_files.
for (const std::string& clone_file : clone_file_names) {
if (local_file_names.find(clone_file) != local_file_names.end()) {
VLOG_NOTICE << "find same file when clone, skip it. "
<< "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file;
continue;
}
/// if the binlog file already exists at the destination and the md5sums match, skip linking it
bool skip_link_file = false;
std::string to;
if (clone_file.ends_with(".binlog") || clone_file.ends_with(".binlog-index")) {
if (!contain_binlog) {
LOG(WARNING) << "clone binlog file, but not contain binlog metas. "
<< "tablet=" << tablet->tablet_id() << ", clone_file=" << clone_file;
break;
}
if (auto&& result =
check_dest_binlog_valid(tablet_dir, clone_dir, clone_file, &skip_link_file);
result) {
to = std::move(result.value());
} else {
status = std::move(result.error());
return status;
}
} else {
to = fmt::format("{}/{}", tablet_dir, clone_file);
}
if (!skip_link_file) {
auto from = fmt::format("{}/{}", clone_dir, clone_file);
status = io::global_local_filesystem()->link_file(from, to);
if (!status.ok()) {
return status;
}
linked_success_files.emplace_back(std::move(to));
}
}
if (contain_binlog) {
status = tablet->ingest_binlog_metas(&rowset_binlog_metas_pb);
if (!status.ok()) {
return status;
}
}
// clone and compaction operations should be performed sequentially
std::lock_guard base_compaction_lock(tablet->get_base_compaction_lock());
std::lock_guard cumulative_compaction_lock(tablet->get_cumulative_compaction_lock());
std::lock_guard cold_compaction_lock(tablet->get_cold_compaction_lock());
std::lock_guard build_inverted_index_lock(tablet->get_build_inverted_index_lock());
std::lock_guard<std::mutex> push_lock(tablet->get_push_lock());
std::lock_guard<std::mutex> rwlock(tablet->get_rowset_update_lock());
std::lock_guard<std::shared_mutex> wrlock(tablet->get_header_lock());
SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD);
if (is_incremental_clone) {
status = _finish_incremental_clone(tablet, cloned_tablet_meta, version);
} else {
status = _finish_full_clone(tablet, cloned_tablet_meta);
}
// if a full clone succeeded, the cumulative layer point needs to be reset
if (!is_incremental_clone && status.ok()) {
tablet->set_cumulative_layer_point(Tablet::K_INVALID_CUMULATIVE_POINT);
}
// the clone dir itself is removed by the `remove_clone_dir` Defer above
return status;
}
/// This method will do:
/// 1. Get missing version from local tablet again and check if they exist in cloned tablet.
/// 2. Revise the local tablet meta to add all incremental cloned rowset's meta.
Status EngineCloneTask::_finish_incremental_clone(Tablet* tablet,
const TabletMetaSharedPtr& cloned_tablet_meta,
int64_t version) {
LOG(INFO) << "begin to finish incremental clone. tablet=" << tablet->tablet_id()
<< ", visible_version=" << version
<< ", cloned_tablet_replica_id=" << cloned_tablet_meta->replica_id();
/// Get the missing versions from the local tablet again.
/// We fetched them earlier outside the lock, so they must be fetched again under the lock.
Versions missed_versions = tablet->get_missed_versions_unlocked(version);
VLOG_NOTICE << "get missed versions again when finish incremental clone. "
<< "tablet=" << tablet->tablet_id() << ", clone version=" << version
<< ", missed_versions_size=" << missed_versions.size();
// check missing versions exist in clone src
std::vector<RowsetSharedPtr> rowsets_to_clone;
for (Version nested_version : missed_versions) {
auto rs_meta = cloned_tablet_meta->acquire_rs_meta_by_version(nested_version);
if (rs_meta == nullptr) {
return Status::InternalError("missed version {} is not found in cloned tablet meta",
nested_version.to_string());
}
RowsetSharedPtr rs;
RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs));
rowsets_to_clone.push_back(std::move(rs));
}
/// Apply the cloned data to the tablet.
/// For an incremental clone nothing will be deleted,
/// so versions_to_delete is empty.
return tablet->revise_tablet_meta(rowsets_to_clone, {}, true);
}
/// This method will do:
/// 1. Compare the version of local tablet and cloned tablet to decide which version to keep
/// 2. Revise the local tablet meta
Status EngineCloneTask::_finish_full_clone(Tablet* tablet,
const TabletMetaSharedPtr& cloned_tablet_meta) {
Version cloned_max_version = cloned_tablet_meta->max_version();
LOG(INFO) << "begin to finish full clone. tablet=" << tablet->tablet_id()
<< ", cloned_max_version=" << cloned_max_version;
// Compare the version of local tablet and cloned tablet.
// For example:
// clone version is 8
//
// local tablet: [0-1] [2-5] [6-6] [7-7] [9-10]
// clone tablet: [0-1] [2-4] [5-6] [7-8]
//
// after compare, the version mark with "x" will be deleted
//
// local tablet: [0-1]x [2-5]x [6-6]x [7-7]x [9-10]
// clone tablet: [0-1] [2-4] [5-6] [7-8]
std::vector<RowsetSharedPtr> to_delete;
std::vector<RowsetSharedPtr> to_add;
for (auto& [v, rs] : tablet->rowset_map()) {
// If a local version crosses the src's latest version, the clone fails.
// For example, if the local versions are 0-0, 1-1, 2-10, 12-14, 15-15, 16-16
// and the cloned max version is 13-13, this clone fails because the local data
// cannot be filled in using the cloned data.
// This should not happen, because if there is a hole, the following deltas will not
// be compacted.
if (v.first <= cloned_max_version.second && v.second > cloned_max_version.second) {
return Status::InternalError(
"version cross src latest. cloned_max_version={}, local_version={}",
cloned_max_version.second, v.to_string());
}
if (v.second <= cloned_max_version.second) {
to_delete.push_back(rs);
} else {
// cooldowned rowsets MUST be continuous, so rowsets whose version > missed version MUST be local rowset
DCHECK(rs->is_local());
}
}
to_add.reserve(cloned_tablet_meta->all_rs_metas().size());
for (const auto& [_, rs_meta] : cloned_tablet_meta->all_rs_metas()) {
RowsetSharedPtr rs;
RETURN_IF_ERROR(tablet->create_rowset(rs_meta, &rs));
to_add.push_back(std::move(rs));
}
{
std::shared_lock cooldown_conf_rlock(tablet->get_cooldown_conf_lock());
if (tablet->cooldown_conf_unlocked().cooldown_replica_id == tablet->replica_id()) {
// If this replica is the cooldown replica, we MUST generate a new `cooldown_meta_id` to avoid using a `cooldown_meta_id`
// generated in an old cooldown term, which may lead to the following situation:
// Replica A is cooldown replica, cooldown_meta_id=2,
// Replica B: cooldown_replica=A, cooldown_meta_id=1
// Replica A: full clone Replica A, cooldown_meta_id=1, but remote cooldown_meta is still with cooldown_meta_id=2
// After the tablet report, FE finds all replicas' cooldowned data consistent
// Replica A: confirm_unused_remote_files, delete some cooldowned data of cooldown_meta_id=2
// Replica B: follow_cooldown_data, cooldown_meta_id=2, data lost
tablet->tablet_meta()->set_cooldown_meta_id(UniqueId::gen_uid());
} else {
tablet->tablet_meta()->set_cooldown_meta_id(cloned_tablet_meta->cooldown_meta_id());
}
}
if (tablet->enable_unique_key_merge_on_write()) {
tablet->tablet_meta()->delete_bitmap().merge(cloned_tablet_meta->delete_bitmap());
}
return tablet->revise_tablet_meta(to_add, to_delete, false);
// TODO(plat1ko): write cooldown meta to remote if this replica is cooldown replica
}
} // namespace doris