blob: 652288d6f535defbe4480983fbd5c4a21a1c2dad [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Rolling Update</title>
<link rel="stylesheet" href="/assets/css/app.css">
<link rel="shortcut icon" href="/assets/images/favicon.ico">
<link rel="stylesheet" href="/assets/css/utilities.min.css">
<link rel="stylesheet" href="/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Rolling Update | Pegasus</title>
<meta name="generator" content="Jekyll v4.3.3" />
<meta property="og:title" content="Rolling Update" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Design goals" />
<meta property="og:description" content="Design goals" />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2024-04-22T13:02:52+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Rolling Update" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2024-04-22T13:02:52+00:00","datePublished":"2024-04-22T13:02:52+00:00","description":"Design goals","headline":"Rolling Update","mainEntityOfPage":{"@type":"WebPage","@id":"/administration/rolling-update"},"url":"/administration/rolling-update"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/">
<img src="/assets/images/pegasus-logo-inv.png" alt="Pegasus" style="width: 80%;">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">The Pegasus documentation</p>
<ul class="menu-list">
<li>
<a href="/docs/downloads"
class="">
Downloads
</a>
</li>
</ul>
<p class="menu-label">Building Pegasus</p>
<ul class="menu-list">
<li>
<a href="/docs/build/compile-by-docker"
class="">
Compile by docker (recommended)
</a>
</li>
<li>
<a href="/docs/build/compile-from-source"
class="">
Compile from source
</a>
</li>
</ul>
<p class="menu-label">Client Libs</p>
<ul class="menu-list">
<li>
<a href="/clients/java-client"
class="">
Java Client
</a>
</li>
<li>
<a href="/clients/cpp-client"
class="">
C++ Client
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang Client
</a>
</li>
<li>
<a href="/clients/python-client"
class="">
Python Client
</a>
</li>
<li>
<a href="/clients/node-client"
class="">
NodeJS Client
</a>
</li>
<li>
<a href="/clients/scala-client"
class="">
Scala Client
</a>
</li>
</ul>
<p class="menu-label">Tools</p>
<ul class="menu-list">
<li>
<a href="/docs/tools/shell"
class="">
Pegasus Shell
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
Admin CLI
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
Pegasus data access CLI
</a>
</li>
</ul>
<p class="menu-label">API</p>
<ul class="menu-list">
<li>
<a href="/api/ttl"
class="">
TTL(Time To Live)
</a>
</li>
<li>
<a href="/api/single-atomic"
class="">
Single-Atomic Operations
</a>
</li>
<li>
<a href="/api/redis"
class="">
Redis Adaption
</a>
</li>
<li>
<a href="/api/geo"
class="">
GEO Support
</a>
</li>
<li>
<a href="/api/http"
class="">
HTTP API
</a>
</li>
</ul>
<p class="menu-label">Admin</p>
<ul class="menu-list">
<li>
<a href="/administration/deployment"
class="">
Deployment
</a>
</li>
<li>
<a href="/administration/config"
class="">
Configurations
</a>
</li>
<li>
<a href="/administration/rebalance"
class="">
Rebalance
</a>
</li>
<li>
<a href="/administration/monitoring"
class="">
Monitoring
</a>
</li>
<li>
<a href="/administration/rolling-update"
class="is-active">
Rolling Restart and Upgrade
</a>
</li>
<li>
<a href="/administration/scale-in-out"
class="">
Scale-in and Scale-out
</a>
</li>
<li>
<a href="/administration/resource-management"
class="">
Resource Management
</a>
</li>
<li>
<a href="/administration/cold-backup"
class="">
Cold Backup
</a>
</li>
<li>
<a href="/administration/meta-recovery"
class="">
Metadata Recovery
</a>
</li>
<li>
<a href="/administration/replica-recovery"
class="">
Replica Data Recovery
</a>
</li>
<li>
<a href="/administration/zk-migration"
class="">
Zookeeper Migration
</a>
</li>
<li>
<a href="/administration/table-migration"
class="">
Table Migration
</a>
</li>
<li>
<a href="/administration/table-soft-delete"
class="">
Table Soft-Delete
</a>
</li>
<li>
<a href="/administration/table-env"
class="">
Table Environment Variables
</a>
</li>
<li>
<a href="/administration/remote-commands"
class="">
Remote Command
</a>
</li>
<li>
<a href="/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/administration/duplication"
class="">
Duplication
</a>
</li>
<li>
<a href="/administration/compression"
class="">
Data Compression
</a>
</li>
<li>
<a href="/administration/throttling"
class="">
Throttling
</a>
</li>
<li>
<a href="/administration/experiences"
class="">
Experiences
</a>
</li>
<li>
<a href="/administration/manual-compact"
class="">
Manual Compact
</a>
</li>
<li>
<a href="/administration/usage-scenario"
class="">
Usage Scenario
</a>
</li>
<li>
<a href="/administration/bad-disk"
class="">
Bad Disk Repair
</a>
</li>
<li>
<a href="/administration/whitelist"
class="">
Replica Server Whitelist
</a>
</li>
<li>
<a href="/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/administration/hotspot-detection"
class="">
Hotspot Detection
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png" alt="Pegasus">
</a>
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/administration/rolling-update" lang="zh" hreflang="zh"><strong>中文</strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
The Pegasus documentation
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/downloads"
class="navbar-item ">
Downloads
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Building Pegasus
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/build/compile-by-docker"
class="navbar-item ">
Compile by docker (recommended)
</a>
<a href="/docs/build/compile-from-source"
class="navbar-item ">
Compile from source
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Client Libs
</span>
</a>
<div class="navbar-dropdown">
<a href="/clients/java-client"
class="navbar-item ">
Java Client
</a>
<a href="/clients/cpp-client"
class="navbar-item ">
C++ Client
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang Client
</a>
<a href="/clients/python-client"
class="navbar-item ">
Python Client
</a>
<a href="/clients/node-client"
class="navbar-item ">
NodeJS Client
</a>
<a href="/clients/scala-client"
class="navbar-item ">
Scala Client
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Tools
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/tools/shell"
class="navbar-item ">
Pegasus Shell
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
Admin CLI
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
Pegasus data access CLI
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
API
</span>
</a>
<div class="navbar-dropdown">
<a href="/api/ttl"
class="navbar-item ">
TTL(Time To Live)
</a>
<a href="/api/single-atomic"
class="navbar-item ">
Single-Atomic Operations
</a>
<a href="/api/redis"
class="navbar-item ">
Redis Adaption
</a>
<a href="/api/geo"
class="navbar-item ">
GEO Support
</a>
<a href="/api/http"
class="navbar-item ">
HTTP API
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Admin
</span>
</a>
<div class="navbar-dropdown">
<a href="/administration/deployment"
class="navbar-item ">
Deployment
</a>
<a href="/administration/config"
class="navbar-item ">
Configurations
</a>
<a href="/administration/rebalance"
class="navbar-item ">
Rebalance
</a>
<a href="/administration/monitoring"
class="navbar-item ">
Monitoring
</a>
<a href="/administration/rolling-update"
class="navbar-item is-active">
Rolling Restart and Upgrade
</a>
<a href="/administration/scale-in-out"
class="navbar-item ">
Scale-in and Scale-out
</a>
<a href="/administration/resource-management"
class="navbar-item ">
Resource Management
</a>
<a href="/administration/cold-backup"
class="navbar-item ">
Cold Backup
</a>
<a href="/administration/meta-recovery"
class="navbar-item ">
Metadata Recovery
</a>
<a href="/administration/replica-recovery"
class="navbar-item ">
Replica Data Recovery
</a>
<a href="/administration/zk-migration"
class="navbar-item ">
Zookeeper Migration
</a>
<a href="/administration/table-migration"
class="navbar-item ">
Table Migration
</a>
<a href="/administration/table-soft-delete"
class="navbar-item ">
Table Soft-Delete
</a>
<a href="/administration/table-env"
class="navbar-item ">
Table Environment Variables
</a>
<a href="/administration/remote-commands"
class="navbar-item ">
Remote Command
</a>
<a href="/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/administration/duplication"
class="navbar-item ">
Duplication
</a>
<a href="/administration/compression"
class="navbar-item ">
Data Compression
</a>
<a href="/administration/throttling"
class="navbar-item ">
Throttling
</a>
<a href="/administration/experiences"
class="navbar-item ">
Experiences
</a>
<a href="/administration/manual-compact"
class="navbar-item ">
Manual Compact
</a>
<a href="/administration/usage-scenario"
class="navbar-item ">
Usage Scenario
</a>
<a href="/administration/bad-disk"
class="navbar-item ">
Bad Disk Repair
</a>
<a href="/administration/whitelist"
class="navbar-item ">
Replica Server Whitelist
</a>
<a href="/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/administration/hotspot-detection"
class="navbar-item ">
Hotspot Detection
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Given the limitation of docsearch that couldn't handle multiple input,
I make searchbox only shown in desktop. Fix this issue when docsearch.js v3 released.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/administration/rolling-update" lang="zh" hreflang="zh"><strong>中文</strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">Rolling Restart and Upgrade</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<h1 id="design-goals">Design goals</h1>
<p>When upgrading the Pegasus server version or persistently modifying the configuration, it is necessary to restart the cluster. For distributed clusters, the commonly used restart method is <strong>Rolling Restart</strong>, which means restarting servers one by one without stopping cluster service.</p>
<blockquote>
<p>The following document assumes that the number of replicas of tables in the Pegasus cluster is 3.</p>
</blockquote>
<p>The important goal of cluster restart is to maintain continuous service and minimize the impact on availability. During the restart process, the following factors can affect service availability:</p>
<ul>
<li>After the Replica Server process is killed, the replicas served by the process cannot provide services:
<ul>
<li>For primary replicas: Since the primary replicas directly provide reading and writing services to the client, killing a process will definitely affect read and write operations, and service can only recover after the Meta Server reassigns new primary replicas. The Meta Server maintains the liveness status of the Replica Servers through beacons, and the latency of the Failure Detector depends on the configuration parameter <code class="language-plaintext highlighter-rouge">fd_grace_seconds</code>, which defaults to 10 seconds. This means it takes up to 10 seconds for the Meta Server to know that the Replica Server is down and then reassign new primary replicas.</li>
<li>For secondary replicas: Since the secondary replicas do not serve reads, theoretically they have no impact on reads. But it will affect writing because the PacificA consistency protocol requires all replicas to be written successfully before the write operation can be committed. After the process is killed, the primary replica will find that the secondary replica has been lost during the write operation, and then notify the Meta Server to kick it out. After the <em>reconfiguration</em> stage, the replica group consists of one primary and one secondary replica and continues to provide write service. For write operations that have not yet been completed during this switching process, even if the <em>reconciliation</em> stage executes them again, the client may have timed out, which has a certain impact on availability. However, this impact is relatively small because <em>reconfiguration</em> is relatively fast and can usually be completed within 1 second.</li>
</ul>
</li>
<li>Restarting Meta Server: The impact of restarting Meta Server on availability can be almost negligible. Because the client retrieves the service node information for each partition from the Meta Server for the first time and caches the information locally, there is usually no need to query from Meta Server again. Therefore, a short disconnection during the Meta Server restart process has little impact on the client. However, considering that the Meta Server needs to maintain beacons with the Replica Server, it is important to avoid stopping the Meta Server process for a long time, which could cause the Replica Server to be disconnected.</li>
<li>Restarting the Collector: Restarting the Collector has no impact on availability. However, availability metrics are collected from the Collector, so it may have a slight impact on the metrics data.</li>
</ul>
<p>Therefore, the following points can be considered to keep availability during cluster restart:</p>
<ul>
<li>Only one process can be restarted at a time, and the next process can only be restarted after the previous one has restarted and fully recovered to provide service. Because:
<ul>
<li>If the cluster does not recover to a fully healthy state after restarting a process, and some partitions still have only one primary and one secondary replica, then killing another Replica Server process is likely to enter a state with only one primary replica, making it unable to provide write service.</li>
<li>Waiting for all partitions in the cluster to recover three replicas before restarting the next process can also reduce the risk of data loss.</li>
</ul>
</li>
<li>Proactively migrate replicas before Failure Detector delays impact availability, instead of migrating passively. Because:
<ul>
<li>Passive migration requires waiting for the Failure Detector to detect Replica Server loss, while proactive migration involves migrating the primary replicas served by this server to other servers before killing the process. This <code class="language-plaintext highlighter-rouge">reconfiguration</code> procedure is fast and typically takes less than 1 second to complete.</li>
</ul>
</li>
<li>Try to manually downgrade the secondary replicas served by the Replica Server before killing the process. Because:
<ul>
<li>Proactively triggering the <code class="language-plaintext highlighter-rouge">reconfiguration</code>, rather than having it triggered passively by write failures, further reduces the impact on availability.</li>
</ul>
</li>
<li>Minimize the workload of the recovery process during process restart to shorten the process restart time.
<ul>
<li>The Replica Server needs to replay WAL logs to recover data upon restart. If it is killed directly, the amount of data that needs to be replayed may be large. However, if the flush operation of memtables to disk is actively triggered before killing, the amount of data that needs to be replayed during restart will be greatly reduced, and the restart time will be much shorter. The time required for the entire cluster to restart can also be greatly reduced.</li>
</ul>
</li>
<li>Minimize unnecessary data transmission between servers to avoid availability impacts caused by high CPU, network IO, and disk IO load when transmitting data.
<ul>
<li>After the Replica Server crashes, some partitions enter the state of <code class="language-plaintext highlighter-rouge">1 primary + 1 secondary</code>. If the Meta Server immediately supplements replicas on other Replica Servers, it will cause a large amount of cross-server data transmission, increase CPU, network IO, and disk IO load, and affect cluster stability. Pegasus’s solution to this problem is to allow the <code class="language-plaintext highlighter-rouge">1 primary + 1 secondary</code> state for a period of time, providing a maintenance window for the restarted Replica Server. If it has not recovered by the end of that window, the missing replicas will be replenished on other Replica Servers. This balances the data integrity and the stability of the cluster. The wait time can be configured through the parameter <code class="language-plaintext highlighter-rouge">replica_assign_delay_ms_for_dropouts</code>, which defaults to 5 minutes.</li>
</ul>
</li>
</ul>
<h1 id="restart-steps">Restart steps</h1>
<h2 id="high-availability-restart-steps">High availability restart steps</h2>
<ul>
<li>If it is an upgrade, please prepare new server packages and configuration files first</li>
<li>Use shell tools to set the meta level of the cluster to <code class="language-plaintext highlighter-rouge">steady</code>, turn off <a href="rebalance">load balancing</a>, and avoid unnecessary replica migration
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; set_meta_level steady
</code></pre></div> </div>
</li>
<li>Use shell tools to set the maintenance window of a single Replica Server
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; remote_command -t meta-server meta.lb.assign_delay_ms $value
</code></pre></div> </div>
<p><code class="language-plaintext highlighter-rouge">value</code> can be understood as the maintenance window of a single Replica Server, which is the trigger time for the Meta Server to supplement replicas to other servers after discovering that the Replica Server is lost. For example, configure to <code class="language-plaintext highlighter-rouge">3600000</code>.</p>
</li>
<li>Restart the Replica Server process one by one. Restart a single Replica Server steps:
<ul>
<li>Use shell tools to send <a href="remote-commands#meta-server">remote commands</a> to Meta Server, temporarily disable <code class="language-plaintext highlighter-rouge">add_secondary</code> operations:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; remote_command -t meta-server meta.lb.add_secondary_max_count_for_one_node 0
</code></pre></div> </div>
</li>
<li>Use <code class="language-plaintext highlighter-rouge">migrate_node</code> command to transfer all primary replicas on the Replica Server to other servers:
<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nv">$ </span>./run.sh migrate_node <span class="nt">-c</span> <span class="nv">$meta_list</span> <span class="nt">-n</span> <span class="nv">$node</span> <span class="nt">-t</span> run
</code></pre></div> </div>
<p>Use shell tools to check the replicas served by the server through the <code class="language-plaintext highlighter-rouge">nodes -d</code> command, and wait for the number of <strong>primary</strong> replicas to become 0. If it doesn’t change to 0 for a long time, please execute the command again.</p>
</li>
<li>Use <code class="language-plaintext highlighter-rouge">downgrade_node</code> command to downgrade all secondary replicas on the Replica Server to <code class="language-plaintext highlighter-rouge">INACTIVE</code>:
<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nv">$ </span>./run.sh downgrade_node <span class="nt">-c</span> <span class="nv">$meta_list</span> <span class="nt">-n</span> <span class="nv">$node</span> <span class="nt">-t</span> run
</code></pre></div> </div>
<p>Use shell tools to check the replicas served by the server through the <code class="language-plaintext highlighter-rouge">nodes -d</code> command, and wait for the number of <strong>secondary</strong> replicas to become 0. If it doesn’t change to 0 for a long time, please execute the command again.</p>
</li>
<li>Use shell tools to send a remote command to the Replica Server to close all replicas and trigger flush operations:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; remote_command -l $node replica.kill_partition
</code></pre></div> </div>
<p>Wait about 1 minute for the data to finish being flushed to disk.</p>
</li>
<li>If it is an upgrade, replace the package and configuration file</li>
<li>Restart the Replica Server process</li>
<li>Use shell tools to send <a href="remote-commands#meta-server">remote commands</a> to Meta Server, enable <code class="language-plaintext highlighter-rouge">add_secondary</code> operations, let it quickly supplement replicas:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; remote_command -t meta-server meta.lb.add_secondary_max_count_for_one_node 100
</code></pre></div> </div>
</li>
<li>Use the <code class="language-plaintext highlighter-rouge">ls -d</code> command of the shell tool to check the cluster status and wait for all partitions to fully recover health</li>
<li>Continue with the next Replica Server</li>
</ul>
</li>
<li>Restart the Meta Server process one by one. Restart a single Meta Server steps:
<ul>
<li>If it is an upgrade, replace the package and configuration file</li>
<li>Restart the Meta Server process</li>
<li>Wait for more than 30 seconds to ensure the continuity of beacons between Meta Server and Replica Servers</li>
<li>Continue with the next Meta Server</li>
</ul>
</li>
<li>Restart the Collector process:
<ul>
<li>If it is an upgrade, replace the package and configuration file</li>
<li>Restart the Collector process</li>
</ul>
</li>
<li>Reset configurations
<ul>
<li>Reset the configurations modified in the above steps using shell tools:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; remote_command -t meta-server meta.lb.add_secondary_max_count_for_one_node DEFAULT
&gt;&gt;&gt; remote_command -t meta-server meta.lb.assign_delay_ms DEFAULT
</code></pre></div> </div>
</li>
</ul>
</li>
</ul>
<h2 id="simplified-restart-steps">Simplified restart steps</h2>
<p>If the availability requirement is not high, the restart steps can be simplified as follows:</p>
<ul>
<li>If it is an upgrade, please prepare new server packages and configuration files first</li>
<li>Use shell tools to set the meta level of the cluster to <code class="language-plaintext highlighter-rouge">steady</code>, turn off <a href="rebalance">load balancing</a>, and avoid unnecessary replica migration
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; set_meta_level steady
</code></pre></div> </div>
</li>
<li>Restart the Replica Server process one by one. Restart a single Replica Server steps:
<ul>
<li>If it is an upgrade, replace the package and configuration file</li>
<li>Restart the Replica Server process</li>
<li>Use the <code class="language-plaintext highlighter-rouge">ls -d</code> command of the shell tool to check the cluster status and wait for all partitions to fully recover health</li>
<li>Continue with the next Replica Server</li>
</ul>
</li>
<li>Restart the Meta Server process one by one. Restart a single Meta Server steps:
<ul>
<li>If it is an upgrade, replace the package and configuration file</li>
<li>Restart the Meta Server process</li>
<li>Wait for more than 30 seconds to ensure the continuity of beacons between Meta Server and Replica Servers</li>
<li>Continue with the next Meta Server</li>
</ul>
</li>
<li>Restart the Collector process:
<ul>
<li>If it is an upgrade, replace the package and configuration file</li>
<li>Restart the Collector process</li>
</ul>
</li>
</ul>
<h1 id="restart-script">Restart script</h1>
<p>For reference, see the script based on <a href="https://github.com/XiaoMi/minos">Minos</a> and the <a href="#high-availability-restart-steps">High availability restart steps</a>: <a href="https://github.com/apache/incubator-pegasus/blob/master/scripts/pegasus_rolling_update.sh">scripts/pegasus_rolling_update.sh</a>.</p>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="http://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="http://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
Table of contents
</p>
<ul class="menu-list">
<li><a href="#design-goals">Design goals</a></li>
<li><a href="#restart-steps">Restart steps</a>
<ul>
<li><a href="#high-availability-restart-steps">High availability restart steps</a></li>
<li><a href="#simplified-restart-steps">Simplified restart steps</a></li>
</ul>
</li>
<li><a href="#restart-script">Restart script</a></li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
// Initialise the search box: hand the docsearch() entry point a single
// options object targeting the #docsearch element.
(function () {
  var searchOptions = {
    container: '#docsearch',
    appId: 'QRN30RBW0S',
    indexName: 'pegasus-apache',
    apiKey: 'd3a3252fa344359766707a106c4ed88f',
    // NOTE(review): debug mode is enabled here — confirm this is intended for the published site.
    debug: true
  };
  docsearch(searchOptions);
})();
</script>
</body>
</html>