#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# 1. Run fdb_ctrl.sh deploy on each machine to deploy FoundationDB.
# This will create the necessary directories, configuration files.
#
# 2. Run fdb_ctrl.sh start on each machine to start the fdb cluster
# and get the cluster connection string.
#
set -euo pipefail

# Resolve the directory containing this script (following symlinks).
ROOT_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" &>/dev/null && pwd)"

# fdb_vars.sh supplies the deployment configuration (FDB_HOME, FDB_VERSION,
# FDB_CLUSTER_ID, FDB_CLUSTER_IPS, DATA_DIRS, resource limits, ...).
if [[ -f "${ROOT_DIR}/fdb_vars.sh" ]]; then
    source "${ROOT_DIR}/fdb_vars.sh"
else
    echo "Please create fdb_vars.sh first"
    exit 1
fi

# Use ${VAR:-} in the checks below: with `set -u`, testing an unset variable
# directly would abort with "unbound variable" before the friendly error
# message could be printed.
if [[ ! -d "${FDB_HOME:-}" ]]; then
    echo "Please set and create FDB_HOME:${FDB_HOME:-} first"
    exit 1
fi

if [[ ! "${FDB_HOME}" = /* ]]; then
    echo "${FDB_HOME} is not an absolute path."
    exit 1
fi

if [[ -z "${FDB_CLUSTER_ID:-}" ]]; then
    echo "Please set FDB_CLUSTER_ID first"
    exit 1
fi

# TODO verify config

FDB_CLUSTER_DESC=${FDB_CLUSTER_DESC:-"doris-fdb"}

# A dir to provide FDB binary pkgs
FDB_PKG_DIR=${ROOT_DIR}/pkgs/${FDB_VERSION}

FDB_PORT=${FDB_PORT:-4500}

LOG_DIR=${LOG_DIR:-${FDB_HOME}/log}

mkdir -p "${LOG_DIR}"
mkdir -p "${FDB_HOME}"/conf
mkdir -p "${FDB_HOME}"/log
# Abort the whole script when the given TCP port already has a listener.
# $1 - component name (used only in the error message)
# $2 - TCP port to probe
function ensure_port_is_listenable() {
    local comp_name="$1"
    local tcp_port="$2"
    # lsof exits non-zero when nothing listens (or lsof is unavailable);
    # in that case the port is considered usable.
    lsof -nP -iTCP:"${tcp_port}" -sTCP:LISTEN >/dev/null || return 0
    echo "The port ${tcp_port} of ${comp_name} is occupied"
    exit 1
}
# Download the FoundationDB binaries for the current architecture into
# ${FDB_PKG_DIR}. Skips the download when the package dir already exists.
# Files land in a "-tmp" dir first and are renamed on success, so an
# interrupted download never leaves a half-populated package dir behind.
# Reads: FDB_PKG_DIR, FDB_VERSION. Exits non-zero on unsupported arch.
function download_fdb() {
    if [[ -d "${FDB_PKG_DIR}" ]]; then
        echo "FDB package for ${FDB_VERSION} already exists"
        return
    fi

    local arch
    arch=$(uname -m)

    local URL
    if [[ "${arch}" == "x86_64" ]]; then
        # Note: trailing slash removed from the release URL (the old value
        # produced "…/download/<ver>//fdbbackup.x86_64").
        URL="https://github.com/apple/foundationdb/releases/download/${FDB_VERSION}"
    elif [[ "${arch}" == "aarch64" ]]; then
        URL="https://doris-build.oss-cn-beijing.aliyuncs.com/thirdparty/fdb/aarch64"
    else
        # BUGFIX: bail out early. Previously an unsupported arch fell
        # through to chmod/mv below and crashed on unbound ${TMP} (set -u).
        echo "Unsupported architecture: ""${arch}"
        exit 1
    fi

    local TMP="${FDB_PKG_DIR}-tmp"
    rm -rf "${TMP}"
    mkdir -p "${TMP}"

    if [[ "${arch}" == "x86_64" ]]; then
        wget "${URL}/fdbbackup.x86_64" -O "${TMP}/fdbbackup"
        wget "${URL}/fdbserver.x86_64" -O "${TMP}/fdbserver"
        wget "${URL}/fdbcli.x86_64" -O "${TMP}/fdbcli"
        wget "${URL}/fdbmonitor.x86_64" -O "${TMP}/fdbmonitor"
        wget "${URL}/libfdb_c.x86_64.so" -O "${TMP}/libfdb_c.x86_64.so"
    else
        wget "${URL}/fdbbackup" -O "${TMP}/fdbbackup"
        wget "${URL}/fdbserver" -O "${TMP}/fdbserver"
        wget "${URL}/fdbcli" -O "${TMP}/fdbcli"
        wget "${URL}/fdbmonitor" -O "${TMP}/fdbmonitor"
        wget "${URL}/libfdb_c.aarch64.so" -O "${TMP}/libfdb_c.aarch64.so"
    fi

    chmod +x "${TMP}"/fdb*
    mv "${TMP}" "${FDB_PKG_DIR}"
    echo "Download fdb binary pkgs success"
}
# Print the coordinator list for fdb.cluster: the first N cluster IPs, each
# suffixed with :FDB_PORT and joined by commas. N depends on cluster size:
# 1 coordinator for <=2 nodes, 3 for <=4 nodes, 5 otherwise.
# Reads: FDB_CLUSTER_IPS, FDB_PORT
get_coordinators() {
    local -a nodes
    IFS=',' read -r -a nodes <<<"${FDB_CLUSTER_IPS}"

    local wanted
    if ((${#nodes[@]} <= 2)); then
        wanted=1
    elif ((${#nodes[@]} <= 4)); then
        wanted=3
    else
        wanted=5
    fi

    local joined=""
    local i
    for ((i = 0; i < wanted; i++)); do
        joined+="${nodes[${i}]}:${FDB_PORT},"
    done
    echo "${joined%,}"
}
# Print the FDB redundancy mode to use for the cluster size:
# 1 node -> single, 2-4 nodes -> double, 5+ nodes -> triple.
# Reads: FDB_CLUSTER_IPS
get_fdb_mode() {
    local -a nodes
    IFS=',' read -r -a nodes <<<"${FDB_CLUSTER_IPS}"
    case ${#nodes[@]} in
    0 | 1) echo "single" ;;
    2 | 3 | 4) echo "double" ;;
    *) echo "triple" ;;
    esac
}
# Compute how many stateless/storage/log fdbserver processes fit in the
# given resource budget while honouring STORAGE_STATELESS_LOG_RATIO and the
# per-disk process limits.
#
# Arguments:
#   $1 - memory budget in GB
#   $2 - CPU core budget
# Reads (from fdb_vars.sh): DATA_DIRS, STORAGE_STATELESS_LOG_RATIO,
#   MEDIUM_TYPE, STORAGE_PROCESSES_NUM_PER_SSD/HDD,
#   LOG_PROCESSES_NUM_PER_SSD/HDD, MEMORY_STORAGE_GB, MEMORY_STATELESS_GB,
#   MEMORY_LOG_GB
# Outputs: "<stateless> <storage> <log>" on stdout.
# NOTE(review): assumes the storage component of the ratio is > 0; a ratio
# like "0:x:y" would divide by zero. TODO confirm fdb_vars.sh enforces it.
calculate_process_numbers() {
    local memory_limit_gb=$1
    local cpu_cores_limit=$2
    local data_dir_count

    # Convert comma-separated DATA_DIRS into an array
    IFS=',' read -r -a DATA_DIR_ARRAY <<<"${DATA_DIRS}"
    data_dir_count=${#DATA_DIR_ARRAY[@]}

    # Parse the storage:stateless:log ratio input
    IFS=':' read -r num_storage num_stateless num_log <<<"${STORAGE_STATELESS_LOG_RATIO}"

    # Initialize process counts
    local storage_processes=0
    local stateless_processes=0
    local log_processes=0

    # Per-disk process limits depend on the medium type (SSD is the default).
    local storage_process_num_limit=$((STORAGE_PROCESSES_NUM_PER_SSD * data_dir_count))
    local log_process_num_limit=$((LOG_PROCESSES_NUM_PER_SSD * data_dir_count))
    if [[ "#${MEDIUM_TYPE}" = "#HDD" ]]; then
        storage_process_num_limit=$((STORAGE_PROCESSES_NUM_PER_HDD * data_dir_count))
        log_process_num_limit=$((LOG_PROCESSES_NUM_PER_HDD * data_dir_count))
    fi

    # Grow the process counts one ratio step at a time until a limit is
    # exceeded, then step back to the last configuration that satisfied
    # every constraint.
    while true; do
        storage_processes=$((storage_processes + num_storage))
        stateless_processes=$((storage_processes * num_stateless / num_storage))
        log_processes=$((storage_processes * num_log / num_storage))

        local total_cpu_used=$((storage_processes + stateless_processes + log_processes))
        local total_memory_used=$(((MEMORY_STORAGE_GB * storage_processes) + (MEMORY_STATELESS_GB * stateless_processes) + (MEMORY_LOG_GB * log_processes)))

        # Check datadir limits
        if ((storage_processes > storage_process_num_limit || log_processes > log_process_num_limit)); then
            # BUGFIX: revert the last step before breaking. The old code
            # broke out immediately, returning counts that exceeded the
            # per-disk limits it had just checked.
            storage_processes=$((storage_processes - num_storage))
            stateless_processes=$((storage_processes * num_stateless / num_storage))
            log_processes=$((storage_processes * num_log / num_storage))
            break
        fi

        # Check overall memory/CPU constraints
        if ((total_memory_used <= memory_limit_gb && total_cpu_used <= cpu_cores_limit)); then
            continue
        else
            # Constraints violated: revert the last step.
            storage_processes=$((storage_processes - num_storage))
            stateless_processes=$((storage_processes * num_stateless / num_storage))
            log_processes=$((storage_processes * num_log / num_storage))
            break
        fi
    done

    # Return the values
    echo "${stateless_processes} ${storage_processes} ${log_processes}"
}
# Sanity-check the values from fdb_vars.sh: every cluster IP must answer a
# ping, and CPU_CORES_LIMIT / MEMORY_LIMIT_GB must not exceed what this
# machine actually has. Exits non-zero on the first failed check.
function check_vars() {
    local -a ip_list
    IFS=',' read -r -a ip_list <<<"${FDB_CLUSTER_IPS}"

    command -v ping || echo "ping is not available to check machines are available, please install ping."

    local ip
    for ip in "${ip_list[@]}"; do
        if ! ping -c 1 "${ip}" &>/dev/null; then
            echo "${ip} is not reachable"
            exit 1
        fi
        echo "${ip} is reachable"
    done

    local machine_cores
    machine_cores=$(nproc)
    if [[ ${CPU_CORES_LIMIT} -gt ${machine_cores} ]]; then
        echo "CPU_CORES_LIMIT beyonds number of machine, which is ${machine_cores}"
        exit 1
    fi

    local machine_mem_gb
    machine_mem_gb=$(free -g | awk '/^Mem:/{print $2}')
    if [[ ${MEMORY_LIMIT_GB} -gt ${machine_mem_gb} ]]; then
        echo "MEMORY_LIMIT_GB beyonds memory of machine, which is ${machine_mem_gb}"
        exit 1
    fi
}
# Deploy FoundationDB into ${FDB_HOME}: verify the environment, fetch
# binaries, symlink them, and generate conf/fdb.cluster plus conf/fdb.conf
# with a process layout sized from MEMORY_LIMIT_GB / CPU_CORES_LIMIT.
# Refuses to run when fdbmonitor is already running or any data dir is
# non-empty (the node may already belong to a cluster).
function deploy_fdb() {
check_vars
download_fdb
check_fdb_running
# Symlink the versioned binaries to stable names. fdbbackup is exposed as
# "backup_agent" -- the name the [backup_agent] section in fdb.conf uses.
ln -sf "${FDB_PKG_DIR}/fdbserver" "${FDB_HOME}/fdbserver"
ln -sf "${FDB_PKG_DIR}/fdbmonitor" "${FDB_HOME}/fdbmonitor"
ln -sf "${FDB_PKG_DIR}/fdbbackup" "${FDB_HOME}/backup_agent"
ln -sf "${FDB_PKG_DIR}/fdbcli" "${FDB_HOME}/fdbcli"
CLUSTER_DESC="${FDB_CLUSTER_DESC:-${FDB_CLUSTER_ID}}"
# Convert comma-separated DATA_DIRS into an array
IFS=',' read -r -a DATA_DIR_ARRAY <<<"${DATA_DIRS}"
# Every data dir must exist and be empty; a non-empty dir suggests the node
# already hosts (or hosted) an fdb cluster.
# NOTE(review): handle_error is not defined anywhere in this script -- if
# mkdir fails, the fallback is "command not found". TODO confirm intent.
for DIR in "${DATA_DIR_ARRAY[@]}"; do
mkdir -p "${DIR}" || handle_error "Failed to create data directory ${DIR}"
if [[ -n "$(ls -A "${DIR}")" ]]; then
echo "Error: ${DIR} is not empty. DO NOT run deploy on a node running fdb. If you are sure that the node is not in a fdb cluster, run fdb_ctl.sh clean."
exit 1
fi
done
# fdb.cluster format: <description>:<id>@<coordinator list>
echo -e "\tCreate fdb.cluster, coordinator: $(get_coordinators)"
echo -e "\tfdb.cluster content is: ${CLUSTER_DESC}:${FDB_CLUSTER_ID}@$(get_coordinators)"
cat >"${FDB_HOME}/conf/fdb.cluster" <<EOF
${CLUSTER_DESC}:${FDB_CLUSTER_ID}@$(get_coordinators)
EOF
# fdbmonitor runs its child processes as the current user/group; fall back
# to ${USER} if `id -gn` is unavailable.
GROUP_NAME="$(id -gn 2>/dev/null || echo "${USER}")"
# Base fdb.conf: \$ID is escaped so fdbmonitor (not this script) expands it
# to each process's port number.
cat >"${FDB_HOME}/conf/fdb.conf" <<EOF
[fdbmonitor]
user = ${USER}
group = ${GROUP_NAME}
[general]
restart-delay = 60
cluster-file = ${FDB_HOME}/conf/fdb.cluster
## Default parameters for individual fdbserver processes
[fdbserver]
command = ${FDB_HOME}/fdbserver
public-address = auto:\$ID
listen-address = public
logdir = ${LOG_DIR}
datadir = ${DATA_DIR_ARRAY[0]}/\$ID
EOF
# Read configuration values
MEMORY_LIMIT_GB=${MEMORY_LIMIT_GB:-8}
CPU_CORES_LIMIT=${CPU_CORES_LIMIT:-1}
# Calculate number of processes based on resources and data directories
read -r stateless_processes storage_processes log_processes <<<"$(calculate_process_numbers "${MEMORY_LIMIT_GB}" "${CPU_CORES_LIMIT}")"
echo "stateless process num : ${stateless_processes}, storage_processes : ${storage_processes}, log_processes : ${log_processes}"
# Resources were too small for even one full ratio step: fall back to a
# single class-less fdbserver process on the base port.
if [[ ${storage_processes} -eq 0 ]]; then
# Add one process
PORT=$((FDB_PORT))
echo "[fdbserver.${PORT}]
" >>"${FDB_HOME}/conf/fdb.conf"
fi
# Add stateless processes; ports are allocated sequentially starting at
# FDB_PORT, which is advanced after each process class below.
for ((i = 0; i < stateless_processes; i++)); do
PORT=$((FDB_PORT + i))
echo "[fdbserver.${PORT}]
class = stateless" >>"${FDB_HOME}/conf/fdb.conf"
done
FDB_PORT=$((FDB_PORT + stateless_processes))
# Add storage processes, round-robining them across the data dirs.
STORAGE_DIR_COUNT=${#DATA_DIR_ARRAY[@]}
for ((i = 0; i < storage_processes; i++)); do
PORT=$((FDB_PORT + i))
DIR_INDEX=$((i % STORAGE_DIR_COUNT))
echo "[fdbserver.${PORT}]
class = storage
datadir = ${DATA_DIR_ARRAY[${DIR_INDEX}]}/${PORT}" | tee -a "${FDB_HOME}/conf/fdb.conf" >/dev/null
done
FDB_PORT=$((FDB_PORT + storage_processes))
# Add log processes, also round-robined across the data dirs.
for ((i = 0; i < log_processes; i++)); do
PORT=$((FDB_PORT + i))
DIR_INDEX=$((i % STORAGE_DIR_COUNT))
echo "[fdbserver.${PORT}]
class = log
datadir = ${DATA_DIR_ARRAY[${DIR_INDEX}]}/${PORT}" | tee -a "${FDB_HOME}/conf/fdb.conf" >/dev/null
done
# backup_agent section uses the symlink created above.
echo "[backup_agent]
command = ${FDB_HOME}/backup_agent
logdir = ${LOG_DIR}" >>"${FDB_HOME}/conf/fdb.conf"
echo "Deploy FDB to: ${FDB_HOME}"
}
# Launch fdbmonitor as a daemon using the generated fdb.conf. Fails fast
# when deploy has not been run yet, when an fdbmonitor from a previous run
# is still alive, or when something else already listens on FDB_PORT.
function start_fdb() {
    check_fdb_running
    if [[ ! -f "${FDB_HOME}/fdbmonitor" ]]; then
        echo 'Please run setup before start fdb server'
        exit 1
    fi
    ensure_port_is_listenable "fdbserver" "${FDB_PORT}"
    echo "Run FDB monitor ..."
    local -a monitor_args=(
        --conffile "${FDB_HOME}/conf/fdb.conf"
        --lockfile "${FDB_HOME}/fdbmonitor.pid"
        --daemonize
    )
    "${FDB_HOME}/fdbmonitor" "${monitor_args[@]}"
}
# Stop the fdbmonitor recorded in ${FDB_HOME}/fdbmonitor.pid, if it is still
# alive, and remove the pid file. Sends SIGTERM first so fdbmonitor can shut
# down its fdbserver children cleanly, escalating to SIGKILL only when the
# process is still alive after a short grace period (the old code jumped
# straight to `kill -9`).
function stop_fdb() {
    local fdb_pid_file="${FDB_HOME}/fdbmonitor.pid"
    if [[ ! -f "${fdb_pid_file}" ]]; then
        return 0
    fi
    local fdb_pid
    fdb_pid=$(cat "${fdb_pid_file}")
    if ps -p "${fdb_pid}" >/dev/null; then
        echo "Stop fdbmonitor with pid ${fdb_pid}"
        kill "${fdb_pid}" 2>/dev/null || true
        # Grace period: up to ~2 seconds for a clean shutdown.
        local i
        for ((i = 0; i < 10; i++)); do
            ps -p "${fdb_pid}" >/dev/null || break
            sleep 0.2
        done
        if ps -p "${fdb_pid}" >/dev/null; then
            kill -9 "${fdb_pid}" 2>/dev/null || true
        fi
        rm -f "${fdb_pid_file}"
    fi
}
# Abort the script when the fdbmonitor recorded in ${FDB_HOME}/fdbmonitor.pid
# is still alive; callers must stop it before deploying or starting again.
# A missing pid file or a dead pid is fine (returns 0).
function check_fdb_running() {
    local pid_file="${FDB_HOME}/fdbmonitor.pid"
    [[ -f "${pid_file}" ]] || return 0
    local recorded_pid
    recorded_pid=$(cat "${pid_file}")
    if ps -p "${recorded_pid}" >/dev/null; then
        echo "fdbmonitor with pid ${recorded_pid} is running, stop it first."
        exit 1
    fi
}
# Wipe everything under ${FDB_HOME}. Refuses to run while the recorded
# fdbmonitor process is still alive, when FDB_HOME is unset or "/" (to
# avoid a catastrophic rm -rf), or when there is nothing to remove.
function clean_fdb() {
    local pid_file="${FDB_HOME}/fdbmonitor.pid"
    if [[ -f "${pid_file}" ]]; then
        local monitor_pid
        monitor_pid=$(cat "${pid_file}")
        if ps -p "${monitor_pid}" >/dev/null; then
            echo "fdbmonitor with pid ${monitor_pid} is running, stop it first."
            exit 1
        fi
    fi

    sleep 1

    # Refuse obviously dangerous targets before the rm -rf below.
    if [[ -z "${FDB_HOME}" || "${FDB_HOME}" == "/" ]]; then
        echo "Error: FDB_HOME is not set or is set to root directory. Aborting cleanup."
        exit 1
    fi

    if [[ -z "$(ls -A "${FDB_HOME}")" ]]; then
        echo "Error: FDB_HOME is empty. Nothing to clean."
        exit 1
    fi

    echo "Removing all directories and files under ${FDB_HOME}"
    # ${FDB_HOME:?} aborts instead of expanding to "/*" if somehow unset.
    # NOTE: the glob does not match dotfiles, so hidden entries survive.
    rm -rf "${FDB_HOME:?}"/*
}
# Dispatch the deploy command; only the "fdb" (or "all") job exists today.
function deploy() {
    local target="$1"
    case "${target}" in
    all | fdb) deploy_fdb ;;
    esac
}
# Start the local fdb processes and (on first start) initialize the database.
# $1 - job selector: "all"/"fdb" launches fdbmonitor via start_fdb
# $2 - init selector: "all"/"fdb" runs `configure new` through fdbcli
function start() {
local job="$1"
local init="$2"
if [[ ${job} =~ ^(all|fdb)$ ]]; then
start_fdb
fi
if [[ ${init} =~ ^(all|fdb)$ ]]; then
local fdb_mode
fdb_mode=$(get_fdb_mode)
echo "Try create database in fdb ${fdb_mode}"
# `configure new` fails when the database already exists, so fall back to
# a `status` probe; only if that also fails is the error reported.
# NOTE(review): the `exit 1` runs inside a ( ) subshell, so it only sets
# the chain's exit status -- the actual abort relies on `set -e` at the
# top of the script seeing the whole `||` chain fail.
"${FDB_HOME}/fdbcli" -C "${FDB_HOME}/conf/fdb.cluster" \
--exec "configure new ${fdb_mode} ssd" ||
"${FDB_HOME}/fdbcli" -C "${FDB_HOME}/conf/fdb.cluster" --exec "status" ||
(echo "failed to start fdb, please check that all nodes have same FDB_CLUSTER_ID" &&
exit 1)
fi
echo "Start fdb success, and you can set conf for MetaService:"
echo "fdb_cluster = $(cat "${FDB_HOME}"/conf/fdb.cluster)"
}
# Dispatch the stop command; stop_fdb is launched in the background and all
# background jobs are awaited before returning.
function stop() {
    local target="$1"
    case "${target}" in
    all | fdb) stop_fdb & ;;
    esac
    wait
}
# Dispatch the clean command; clean_fdb is launched in the background and
# all background jobs are awaited before returning.
function clean() {
    local target="$1"
    case "${target}" in
    all | fdb) clean_fdb & ;;
    esac
    wait
}
# List PIDs of processes whose command line mentions the cluster
# description (fdbmonitor/fdbserver started from this deployment).
# Exits non-zero when no matching process is found (pgrep's behavior).
function status() {
    local pattern="${FDB_CLUSTER_DESC}"
    pgrep -f "${pattern}"
}
# Print command help to stdout and exit non-zero.
# Fix: the help previously omitted the supported `download` command (its
# dispatch arm exists in the case statement at the bottom of the script).
function usage() {
    echo "Usage: $0 <CMD> "
    echo -e "\t deploy \t setup fdb env (dir, binary, conf ...)"
    echo -e "\t clean \t clean fdb data"
    echo -e "\t start \t start fdb"
    echo -e "\t stop \t stop fdb"
    echo -e "\t download \t download fdb binary pkgs"
    echo -e "\t fdbcli \t execute fdbcli"
    echo -e ""
    exit 1
}
# Report an unrecognized command word, then show usage (which exits 1).
function unknown_cmd() {
    local bad_cmd="$1"
    printf "Unknown cmd: %s \n" "${bad_cmd}"
    usage
}
# ---- entry point: parse the command word and dispatch ----
if [[ $# -lt 1 ]]; then
usage
fi
cmd="$1"
shift

# Only the "fdb" job/init targets exist today; kept as variables so other
# components could be wired in later.
job="fdb"
init="fdb"

case ${cmd} in
deploy)
deploy "${job}"
;;
start)
start "${job}" "${init}"
;;
stop)
stop "${job}"
;;
clean)
clean "${job}"
;;
fdbcli)
# Remaining arguments are passed straight through to fdbcli.
"${FDB_HOME}/fdbcli" -C "${FDB_HOME}/conf/fdb.cluster" "$@"
;;
config)
# NOTE(review): generate_regression_config is not defined anywhere in this
# script, so the `config` command fails with "command not found". Verify
# whether it should be sourced from elsewhere or this arm removed. The
# `status` function above is also never wired into this dispatch.
generate_regression_config true
;;
download)
download_fdb
;;
*)
unknown_cmd "${cmd}"
;;
esac