blob: 7075e71dc4ec67e0fd4d134c72aa7e29f7f1e119 [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Scale In Out</title>
<link rel="stylesheet" href="/zh/assets/css/app.css">
<link rel="shortcut icon" href="/zh/assets/images/favicon.ico">
<link rel="stylesheet" href="/zh/assets/css/utilities.min.css">
<link rel="stylesheet" href="/zh/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Scale In Out | Pegasus</title>
<meta name="generator" content="Jekyll v4.3.2" />
<meta property="og:title" content="Scale In Out" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="功能目标" />
<meta property="og:description" content="功能目标" />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2023-11-23T14:51:44+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Scale In Out" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2023-11-23T14:51:44+00:00","datePublished":"2023-11-23T14:51:44+00:00","description":"功能目标","headline":"Scale In Out","mainEntityOfPage":{"@type":"WebPage","@id":"/administration/scale-in-out"},"url":"/administration/scale-in-out"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/zh/">
<img src="/assets/images/pegasus-logo-inv.png" style="width: 80%;">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">Pegasus产品文档</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/downloads"
class="">
下载
</a>
</li>
</ul>
<p class="menu-label">编译构建</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/build/compile-by-docker"
class="">
使用Docker完成编译(推荐)
</a>
</li>
<li>
<a href="/zh/docs/build/compile-from-source"
class="">
从源码编译
</a>
</li>
</ul>
<p class="menu-label">客户端库</p>
<ul class="menu-list">
<li>
<a href="/zh/clients/java-client"
class="">
Java客户端
</a>
</li>
<li>
<a href="/zh/clients/cpp-client"
class="">
C++客户端
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang客户端
</a>
</li>
<li>
<a href="/zh/clients/python2-client"
class="">
Python2客户端
</a>
</li>
<li>
<a href="/zh/clients/python3-client"
class="">
Python3客户端
</a>
</li>
<li>
<a href="/zh/clients/node-client"
class="">
NodeJS客户端
</a>
</li>
<li>
<a href="/zh/clients/scala-client"
class="">
Scala客户端
</a>
</li>
</ul>
<p class="menu-label">生态工具</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/tools/shell"
class="">
Pegasus Shell 工具
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
集群管理命令行
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
数据访问命令行
</a>
</li>
</ul>
<p class="menu-label">用户接口</p>
<ul class="menu-list">
<li>
<a href="/zh/api/ttl"
class="">
TTL
</a>
</li>
<li>
<a href="/zh/api/single-atomic"
class="">
单行原子操作
</a>
</li>
<li>
<a href="/zh/api/redis"
class="">
Redis适配
</a>
</li>
<li>
<a href="/zh/api/geo"
class="">
GEO支持
</a>
</li>
<li>
<a href="/zh/api/http"
class="">
HTTP接口
</a>
</li>
</ul>
<p class="menu-label">高效运维</p>
<ul class="menu-list">
<li>
<a href="/zh/administration/deployment"
class="">
集群部署
</a>
</li>
<li>
<a href="/zh/administration/config"
class="">
配置说明
</a>
</li>
<li>
<a href="/zh/administration/rebalance"
class="">
负载均衡
</a>
</li>
<li>
<a href="/zh/administration/monitoring"
class="">
可视化监控
</a>
</li>
<li>
<a href="/zh/administration/rolling-update"
class="">
集群升级
</a>
</li>
<li>
<a href="/zh/administration/scale-in-out"
class="is-active">
集群扩容缩容
</a>
</li>
<li>
<a href="/zh/administration/resource-management"
class="">
资源管理
</a>
</li>
<li>
<a href="/zh/administration/cold-backup"
class="">
冷备份
</a>
</li>
<li>
<a href="/zh/administration/meta-recovery"
class="">
元数据恢复
</a>
</li>
<li>
<a href="/zh/administration/replica-recovery"
class="">
Replica数据恢复
</a>
</li>
<li>
<a href="/zh/administration/zk-migration"
class="">
Zookeeper迁移
</a>
</li>
<li>
<a href="/zh/administration/table-migration"
class="">
Table迁移
</a>
</li>
<li>
<a href="/zh/administration/table-soft-delete"
class="">
Table软删除
</a>
</li>
<li>
<a href="/zh/administration/table-env"
class="">
Table环境变量
</a>
</li>
<li>
<a href="/zh/administration/remote-commands"
class="">
远程命令
</a>
</li>
<li>
<a href="/zh/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/zh/administration/duplication"
class="">
跨机房同步
</a>
</li>
<li>
<a href="/zh/administration/compression"
class="">
数据压缩
</a>
</li>
<li>
<a href="/zh/administration/throttling"
class="">
流量控制
</a>
</li>
<li>
<a href="/zh/administration/experiences"
class="">
运维经验
</a>
</li>
<li>
<a href="/zh/administration/manual-compact"
class="">
Manual Compact功能
</a>
</li>
<li>
<a href="/zh/administration/usage-scenario"
class="">
Usage Scenario功能
</a>
</li>
<li>
<a href="/zh/administration/bad-disk"
class="">
坏盘检修
</a>
</li>
<li>
<a href="/zh/administration/whitelist"
class="">
白名单
</a>
</li>
<li>
<a href="/zh/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/zh/administration/hotspot-detection"
class="">
热点检测
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/zh/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png">
</a>
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<!--If you don't want a url to be relativized, you can add a space explicitly into the href to
prevents a url from being relativized by polyglot.-->
<a class="button is-light is-outlined is-inverted" href=" /administration/scale-in-out"><strong>En</strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Pegasus产品文档
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/downloads"
class="navbar-item ">
下载
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
编译构建
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/build/compile-by-docker"
class="navbar-item ">
使用Docker完成编译(推荐)
</a>
<a href="/zh/docs/build/compile-from-source"
class="navbar-item ">
从源码编译
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
客户端库
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/clients/java-client"
class="navbar-item ">
Java客户端
</a>
<a href="/zh/clients/cpp-client"
class="navbar-item ">
C++客户端
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang客户端
</a>
<a href="/zh/clients/python2-client"
class="navbar-item ">
Python2客户端
</a>
<a href="/zh/clients/python3-client"
class="navbar-item ">
Python3客户端
</a>
<a href="/zh/clients/node-client"
class="navbar-item ">
NodeJS客户端
</a>
<a href="/zh/clients/scala-client"
class="navbar-item ">
Scala客户端
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
生态工具
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/tools/shell"
class="navbar-item ">
Pegasus Shell 工具
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
集群管理命令行
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
数据访问命令行
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
用户接口
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/api/ttl"
class="navbar-item ">
TTL
</a>
<a href="/zh/api/single-atomic"
class="navbar-item ">
单行原子操作
</a>
<a href="/zh/api/redis"
class="navbar-item ">
Redis适配
</a>
<a href="/zh/api/geo"
class="navbar-item ">
GEO支持
</a>
<a href="/zh/api/http"
class="navbar-item ">
HTTP接口
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
高效运维
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/administration/deployment"
class="navbar-item ">
集群部署
</a>
<a href="/zh/administration/config"
class="navbar-item ">
配置说明
</a>
<a href="/zh/administration/rebalance"
class="navbar-item ">
负载均衡
</a>
<a href="/zh/administration/monitoring"
class="navbar-item ">
可视化监控
</a>
<a href="/zh/administration/rolling-update"
class="navbar-item ">
集群升级
</a>
<a href="/zh/administration/scale-in-out"
class="navbar-item is-active">
集群扩容缩容
</a>
<a href="/zh/administration/resource-management"
class="navbar-item ">
资源管理
</a>
<a href="/zh/administration/cold-backup"
class="navbar-item ">
冷备份
</a>
<a href="/zh/administration/meta-recovery"
class="navbar-item ">
元数据恢复
</a>
<a href="/zh/administration/replica-recovery"
class="navbar-item ">
Replica数据恢复
</a>
<a href="/zh/administration/zk-migration"
class="navbar-item ">
Zookeeper迁移
</a>
<a href="/zh/administration/table-migration"
class="navbar-item ">
Table迁移
</a>
<a href="/zh/administration/table-soft-delete"
class="navbar-item ">
Table软删除
</a>
<a href="/zh/administration/table-env"
class="navbar-item ">
Table环境变量
</a>
<a href="/zh/administration/remote-commands"
class="navbar-item ">
远程命令
</a>
<a href="/zh/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/zh/administration/duplication"
class="navbar-item ">
跨机房同步
</a>
<a href="/zh/administration/compression"
class="navbar-item ">
数据压缩
</a>
<a href="/zh/administration/throttling"
class="navbar-item ">
流量控制
</a>
<a href="/zh/administration/experiences"
class="navbar-item ">
运维经验
</a>
<a href="/zh/administration/manual-compact"
class="navbar-item ">
Manual Compact功能
</a>
<a href="/zh/administration/usage-scenario"
class="navbar-item ">
Usage Scenario功能
</a>
<a href="/zh/administration/bad-disk"
class="navbar-item ">
坏盘检修
</a>
<a href="/zh/administration/whitelist"
class="navbar-item ">
白名单
</a>
<a href="/zh/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/zh/administration/hotspot-detection"
class="navbar-item ">
热点检测
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Given the limitation of docsearch that couldn't handle multiple input,
I make searchbox only shown in desktop. Fix this issue when docsearch.js v3 released.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<!--If you don't want a url to be relativized, you can add a space explicitly into the href to
prevents a url from being relativized by polyglot.-->
<a class="button is-light is-outlined is-inverted" href=" /administration/scale-in-out"><strong>En</strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">集群扩容缩容</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<h1 id="功能目标">功能目标</h1>
<p>当集群容量不够或者读写压力太大了,需要通过增加节点来扩容;当集群承担的业务规模缩减时,可以通过减少节点来缩容。</p>
<p>扩容和缩容时,需要考虑这些点:</p>
<ul>
<li>不停服</li>
<li>尽量不要影响可用性</li>
<li>尽量减少不必要的数据拷贝</li>
</ul>
<h1 id="扩容流程">扩容流程</h1>
<p>扩容流程非常简单:</p>
<ul>
<li>要扩容多个机器,就在这些新增机器上启动replica server进程,启动后replica server会主动联系meta server,加入节点列表中。</li>
<li>在meta level为steady的情况下,不会进行<a href="rebalance">负载均衡</a>,因此用shell工具的<code class="language-plaintext highlighter-rouge">nodes -d</code>命令查看,可以看到新节点的状态为ALIVE,但是服务的replica个数为0。</li>
<li>通过shell工具的<code class="language-plaintext highlighter-rouge">set_meta_level lively</code>启动负载均衡,meta server会逐渐将部分replica迁移到新节点上。</li>
<li>通过shell工具的<code class="language-plaintext highlighter-rouge">nodes -d</code>命令查看个节点服务replica的情况,在达到均衡状态后,通过<code class="language-plaintext highlighter-rouge">set_meta_level steady</code>关闭负载均衡,扩容完成。</li>
</ul>
<h1 id="缩容流程">缩容流程</h1>
<p>缩容相对扩容来说要考虑的点就多些,主要包括:</p>
<ul>
<li>如果同时要下线多个节点,需要一个一个进行,等一个下线完成后再下线另一个,避免影响集群的可用度和数据的安全性。</li>
<li>如果同时要下线多个节点,在下线一个节点时,meta server补充备份要避免将备份分派在即将要下线的其他节点上,不然等后面下线其他节点时,又要重新补充备份,造成不必要的跨节点数据拷贝。我们提供了black_list来支持这个功能。</li>
<li><strong>需要注意</strong>:节点下线后,在meta server上的状态会变成UNALIVE,可能会造成ALIVE的节点比例低于配置参数<code class="language-plaintext highlighter-rouge">node_live_percentage_threshold_for_update</code>,如果低于了限制,meta server就会自动降级为freezed状态,此时所有的<code class="language-plaintext highlighter-rouge">reconfiguration</code>操作都无法进行,缩容流程也进行不下去。所以在缩容之前需要计算一下是否会造成这种情况,如果会,就先升级meta server的配置,将<code class="language-plaintext highlighter-rouge">node_live_percentage_threshold_for_update</code>修改至足够低,保证在缩容过程中meta server不会自动降级为freezed状态。</li>
</ul>
<p>推荐的缩容流程:</p>
<ul>
<li>计算缩容后ALIVE的节点比例会不会低于配置参数<code class="language-plaintext highlighter-rouge">node_live_percentage_threshold_for_update</code>,如果低于,就将该配置参数改小些,然后升级meta server。</li>
<li>使用shell工具将集群的meta * level设置为steady,关闭<a href="rebalance">负载均衡功能</a>,避免不必要的replica迁移。
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; set_meta_level steady
</code></pre></div> </div>
</li>
<li>使用shell工具向meta server发送<a href="remote-commands#meta-server">远程命令</a>,设置black_list:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; remote_command -t meta-server meta.lb.assign_secondary_black_list $address_list
</code></pre></div> </div>
<p>其中<code class="language-plaintext highlighter-rouge">address_list</code>是要下线节点的ip:port地址列表,用逗号分隔。</p>
</li>
<li>使用shell工具将<code class="language-plaintext highlighter-rouge">assign_delay_ms</code>设为10,这样做的目的是让节点下线后,立即在其他节点上补充备份:
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; remote_command -t meta-server meta.lb.assign_delay_ms 10
</code></pre></div> </div>
</li>
<li>逐个下线replica server。单个replica server下线流程:
<ul>
<li>kill掉replica server进程。</li>
<li>使用shell的<code class="language-plaintext highlighter-rouge">ls -d</code>命令查看集群状态,等待所有partition都完全恢复健康(所有表的unhealthy数都为0)。</li>
<li>清理该节点上的数据,释放存储空间。</li>
<li>继续下线下一个replica server。</li>
</ul>
</li>
<li>重启meta server:
<ul>
<li>重启是为了重置上面动态修改过的配置,并且让shell的<code class="language-plaintext highlighter-rouge">nodes -d</code>不再显示已经下线的节点。</li>
<li>如果之前调整过配置参数<code class="language-plaintext highlighter-rouge">node_live_percentage_threshold_for_update</code>,重启时需要修改配置文件,再其调整为合适的值。</li>
</ul>
</li>
</ul>
<p>以上过程可以自动化,我们提供了集群升级脚本<a href="https://github.com/apache/incubator-pegasus/blob/master/scripts/pegasus_offline_node_list.sh">scripts/pegasus_offline_node_list.sh</a>。不过这个脚本并不能直接使用,因为其依赖minos部署工具来完成进程的远程stop操作。<code class="language-plaintext highlighter-rouge">pegasus_offline_node_list.sh</code>调用<code class="language-plaintext highlighter-rouge">pegasus_offline_node.sh</code>,因此这两个脚本的minos_client_dir都需要更改。你可以针对你们自己的部署系统,修改脚本中minos相关部分,使其可以正常工作。如需帮助,请联系我们。</p>
<p>注意:在使用集群升级脚本的时候,也要保证配置参数<code class="language-plaintext highlighter-rouge">node_live_percentage_threshold_for_update</code>的值足够小(有必要可以先升级meta-server),避免使集群进入freezed状态。</p>
<h1 id="节点迁移">节点迁移</h1>
<p>通过先<strong>扩容</strong><strong>缩容</strong>,可以实现集群的节点迁移。为了尽量减少数据的重复拷贝和移动,建议按照如下步骤:</p>
<ul>
<li>先扩容:把需要扩容的机器加入到集群中,但是在加入后<strong>暂时不进行负载均衡</strong></li>
<li>再缩容:将需要缩容的机器通过上面的<a href="#缩容流程">缩容流程</a>进行下线,<strong>特别注意参数<code class="language-plaintext highlighter-rouge">node_live_percentage_threshold_for_update</code>的配置</strong></li>
<li>进行<a href="rebalance">负载均衡</a></li>
</ul>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="http://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="http://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
本页导航
</p>
<ul class="menu-list">
<li><a href="#功能目标">功能目标</a></li>
<li><a href="#扩容流程">扩容流程</a></li>
<li><a href="#缩容流程">缩容流程</a></li>
<li><a href="#节点迁移">节点迁移</a></li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
docsearch({
container: '#docsearch',
appId: 'QRN30RBW0S',
indexName: 'pegasus-apache',
apiKey: 'd3a3252fa344359766707a106c4ed88f',
debug: true
});
</script>
</body>
</html>