blob: 61c504d1f33657f2a8a3d0a41a566db0dcabc8f0 [file] [log] [blame]
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Experiences</title>
<link rel="stylesheet" href="/zh/assets/css/app.css">
<link rel="shortcut icon" href="/zh/assets/images/favicon.ico">
<link rel="stylesheet" href="/zh/assets/css/utilities.min.css">
<link rel="stylesheet" href="/zh/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<!-- Duplicate <title> omitted: this <head> already declares the page title above, and HTML permits only one. -->
<meta name="generator" content="Jekyll v4.3.3" />
<meta property="og:title" content="Experiences" />
<meta property="og:locale" content="zh_CN" />
<meta name="description" content="一个分布式系统的Meta Server管理工作包含周期巡检,监控报警,故障排查,接入审核等,通过这些手段来帮助服务稳定运行。" />
<meta property="og:description" content="一个分布式系统的Meta Server管理工作包含周期巡检,监控报警,故障排查,接入审核等,通过这些手段来帮助服务稳定运行。" />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2024-04-22T06:39:52+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Experiences" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2024-04-22T06:39:52+00:00","datePublished":"2024-04-22T06:39:52+00:00","description":"一个分布式系统的Meta Server管理工作包含周期巡检,监控报警,故障排查,接入审核等,通过这些手段来帮助服务稳定运行。","headline":"Experiences","mainEntityOfPage":{"@type":"WebPage","@id":"/administration/experiences"},"url":"/administration/experiences"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/zh/">
<img src="/assets/images/pegasus-logo-inv.png" style="width: 80%;" alt="Pegasus">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">Pegasus 产品文档</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/downloads"
class="">
下载
</a>
</li>
</ul>
<p class="menu-label">编译构建</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/build/compile-by-docker"
class="">
使用 Docker 完成编译(推荐)
</a>
</li>
<li>
<a href="/zh/docs/build/compile-from-source"
class="">
从源码编译
</a>
</li>
</ul>
<p class="menu-label">客户端库</p>
<ul class="menu-list">
<li>
<a href="/zh/clients/java-client"
class="">
Java 客户端
</a>
</li>
<li>
<a href="/zh/clients/cpp-client"
class="">
C++ 客户端
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang 客户端
</a>
</li>
<li>
<a href="/zh/clients/python-client"
class="">
Python 客户端
</a>
</li>
<li>
<a href="/zh/clients/node-client"
class="">
NodeJS 客户端
</a>
</li>
<li>
<a href="/zh/clients/scala-client"
class="">
Scala 客户端
</a>
</li>
</ul>
<p class="menu-label">生态工具</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/tools/shell"
class="">
Pegasus Shell 工具
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
集群管理命令行
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
数据访问命令行
</a>
</li>
</ul>
<p class="menu-label">用户接口</p>
<ul class="menu-list">
<li>
<a href="/zh/api/ttl"
class="">
TTL
</a>
</li>
<li>
<a href="/zh/api/single-atomic"
class="">
单行原子操作
</a>
</li>
<li>
<a href="/zh/api/redis"
class="">
Redis 适配
</a>
</li>
<li>
<a href="/zh/api/geo"
class="">
GEO 支持
</a>
</li>
<li>
<a href="/zh/api/http"
class="">
HTTP 接口
</a>
</li>
</ul>
<p class="menu-label">高效运维</p>
<ul class="menu-list">
<li>
<a href="/zh/administration/deployment"
class="">
集群部署
</a>
</li>
<li>
<a href="/zh/administration/config"
class="">
配置说明
</a>
</li>
<li>
<a href="/zh/administration/rebalance"
class="">
负载均衡
</a>
</li>
<li>
<a href="/zh/administration/monitoring"
class="">
可视化监控
</a>
</li>
<li>
<a href="/zh/administration/rolling-update"
class="">
集群重启和升级
</a>
</li>
<li>
<a href="/zh/administration/scale-in-out"
class="">
集群扩容缩容
</a>
</li>
<li>
<a href="/zh/administration/resource-management"
class="">
资源管理
</a>
</li>
<li>
<a href="/zh/administration/cold-backup"
class="">
冷备份
</a>
</li>
<li>
<a href="/zh/administration/meta-recovery"
class="">
元数据恢复
</a>
</li>
<li>
<a href="/zh/administration/replica-recovery"
class="">
Replica 数据恢复
</a>
</li>
<li>
<a href="/zh/administration/zk-migration"
class="">
Zookeeper 迁移
</a>
</li>
<li>
<a href="/zh/administration/table-migration"
class="">
Table 迁移
</a>
</li>
<li>
<a href="/zh/administration/table-soft-delete"
class="">
Table 软删除
</a>
</li>
<li>
<a href="/zh/administration/table-env"
class="">
Table 环境变量
</a>
</li>
<li>
<a href="/zh/administration/remote-commands"
class="">
远程命令
</a>
</li>
<li>
<a href="/zh/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/zh/administration/duplication"
class="">
跨机房同步
</a>
</li>
<li>
<a href="/zh/administration/compression"
class="">
数据压缩
</a>
</li>
<li>
<a href="/zh/administration/throttling"
class="">
流量控制
</a>
</li>
<li>
<a href="/zh/administration/experiences"
class="is-active">
运维经验
</a>
</li>
<li>
<a href="/zh/administration/manual-compact"
class="">
Manual Compact 功能
</a>
</li>
<li>
<a href="/zh/administration/usage-scenario"
class="">
Usage Scenario 功能
</a>
</li>
<li>
<a href="/zh/administration/bad-disk"
class="">
坏盘检修
</a>
</li>
<li>
<a href="/zh/administration/whitelist"
class="">
Replica Server 白名单
</a>
</li>
<li>
<a href="/zh/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/zh/administration/hotspot-detection"
class="">
热点检测
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/zh/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png" alt="Pegasus">
</a>
<div class="navbar-item">
<!-- A simple language switch button that only supports zh and en. -->
<!-- If the page language is zh, the button switches to en. -->
<!-- To keep polyglot from relativizing a URL, add a leading space inside the href,
as is done on the link below. -->
<a class="button is-light is-outlined is-inverted" href=" /administration/experiences"><strong>En</strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Pegasus 产品文档
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/downloads"
class="navbar-item ">
下载
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
编译构建
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/build/compile-by-docker"
class="navbar-item ">
使用 Docker 完成编译(推荐)
</a>
<a href="/zh/docs/build/compile-from-source"
class="navbar-item ">
从源码编译
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
客户端库
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/clients/java-client"
class="navbar-item ">
Java 客户端
</a>
<a href="/zh/clients/cpp-client"
class="navbar-item ">
C++ 客户端
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang 客户端
</a>
<a href="/zh/clients/python-client"
class="navbar-item ">
Python 客户端
</a>
<a href="/zh/clients/node-client"
class="navbar-item ">
NodeJS 客户端
</a>
<a href="/zh/clients/scala-client"
class="navbar-item ">
Scala 客户端
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
生态工具
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/tools/shell"
class="navbar-item ">
Pegasus Shell 工具
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
集群管理命令行
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
数据访问命令行
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
用户接口
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/api/ttl"
class="navbar-item ">
TTL
</a>
<a href="/zh/api/single-atomic"
class="navbar-item ">
单行原子操作
</a>
<a href="/zh/api/redis"
class="navbar-item ">
Redis 适配
</a>
<a href="/zh/api/geo"
class="navbar-item ">
GEO 支持
</a>
<a href="/zh/api/http"
class="navbar-item ">
HTTP 接口
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
高效运维
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/administration/deployment"
class="navbar-item ">
集群部署
</a>
<a href="/zh/administration/config"
class="navbar-item ">
配置说明
</a>
<a href="/zh/administration/rebalance"
class="navbar-item ">
负载均衡
</a>
<a href="/zh/administration/monitoring"
class="navbar-item ">
可视化监控
</a>
<a href="/zh/administration/rolling-update"
class="navbar-item ">
集群重启和升级
</a>
<a href="/zh/administration/scale-in-out"
class="navbar-item ">
集群扩容缩容
</a>
<a href="/zh/administration/resource-management"
class="navbar-item ">
资源管理
</a>
<a href="/zh/administration/cold-backup"
class="navbar-item ">
冷备份
</a>
<a href="/zh/administration/meta-recovery"
class="navbar-item ">
元数据恢复
</a>
<a href="/zh/administration/replica-recovery"
class="navbar-item ">
Replica 数据恢复
</a>
<a href="/zh/administration/zk-migration"
class="navbar-item ">
Zookeeper 迁移
</a>
<a href="/zh/administration/table-migration"
class="navbar-item ">
Table 迁移
</a>
<a href="/zh/administration/table-soft-delete"
class="navbar-item ">
Table 软删除
</a>
<a href="/zh/administration/table-env"
class="navbar-item ">
Table 环境变量
</a>
<a href="/zh/administration/remote-commands"
class="navbar-item ">
远程命令
</a>
<a href="/zh/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/zh/administration/duplication"
class="navbar-item ">
跨机房同步
</a>
<a href="/zh/administration/compression"
class="navbar-item ">
数据压缩
</a>
<a href="/zh/administration/throttling"
class="navbar-item ">
流量控制
</a>
<a href="/zh/administration/experiences"
class="navbar-item is-active">
运维经验
</a>
<a href="/zh/administration/manual-compact"
class="navbar-item ">
Manual Compact 功能
</a>
<a href="/zh/administration/usage-scenario"
class="navbar-item ">
Usage Scenario 功能
</a>
<a href="/zh/administration/bad-disk"
class="navbar-item ">
坏盘检修
</a>
<a href="/zh/administration/whitelist"
class="navbar-item ">
Replica Server 白名单
</a>
<a href="/zh/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/zh/administration/hotspot-detection"
class="navbar-item ">
热点检测
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Because docsearch cannot handle multiple inputs, the search box is shown
on desktop only. This was to be revisited once docsearch.js v3 was released; this page now
loads docsearch v3, so the limitation should be re-checked.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!-- A simple language switch button that only supports zh and en. -->
<!-- If the page language is zh, the button switches to en. -->
<!-- To keep polyglot from relativizing a URL, add a leading space inside the href,
as is done on the link below. -->
<a class="button is-light is-outlined is-inverted" href=" /administration/experiences"><strong>En</strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">运维经验</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<p>一个分布式系统的Meta Server管理工作包含周期巡检,监控报警,故障排查,接入审核等,通过这些手段来帮助服务稳定运行。</p>
<h1 id="周期巡检">周期巡检</h1>
<ul>
<li>可用性:正常时可用性会保持在 100%,发生节点故障等异常偶尔会有可用性低于 100% 的情况</li>
<li>IOPS:流量的突增可能导致服务稳定性受到影响,而流量的突降可能是服务已故障所致</li>
<li>读写延迟:读写操作的 P99 延迟可能有异常毛刺的情况,对 Pegasus 用户产生影响</li>
<li>系统资源使用:CPU、内存、磁盘的使用量,网络带宽及连接数是否出现暴涨、是否达到高水位线</li>
</ul>
<h1 id="监控报警">监控报警</h1>
<p>参考 <a href="/zh/administration/monitoring">可视化监控</a>.</p>
<h1 id="问题排查">问题排查</h1>
<p>使用 <a href="/zh/overview/shell">Shell 工具</a> 查看 Pegasus 系统状态:</p>
<ul>
<li>集群基础信息是否正常:<code class="language-plaintext highlighter-rouge">cluster_info</code>
<ul>
<li><code class="language-plaintext highlighter-rouge">meta_servers</code> 列表是否正常</li>
<li><code class="language-plaintext highlighter-rouge">meta_function_level</code> 是否是 <code class="language-plaintext highlighter-rouge">steady</code> 状态</li>
</ul>
</li>
<li>各 Table、各 Partition 是否健康:<code class="language-plaintext highlighter-rouge">ls -d</code>
<ul>
<li>Table 数量是否正常</li>
<li>所有 Table 的 <code class="language-plaintext highlighter-rouge">unhealthy</code> 分片数量是否都为 0</li>
</ul>
</li>
<li>各节点是否健康:<code class="language-plaintext highlighter-rouge">nodes -d</code>
<ul>
<li>所有节点都在列表中,且状态都是 <code class="language-plaintext highlighter-rouge">ALIVE</code></li>
<li>数据分布是否倾斜严重(即 <code class="language-plaintext highlighter-rouge">replica_count</code> 列或 <code class="language-plaintext highlighter-rouge">primary_count</code> 列数量不平均)。如果倾斜严重,可以选择集群流量比较小的时间段,使用 shell 工具命令 <code class="language-plaintext highlighter-rouge">set_meta_level</code> 设置为 <code class="language-plaintext highlighter-rouge">lively</code>,使其进行负载均衡调整。记得在调整完成后设置回 <code class="language-plaintext highlighter-rouge">steady</code> 状态
<blockquote>
<p>注意:对于延迟敏感的用户,负载均衡只能在必要的时候才进行,不要影响服务稳定性,在该过程中要密切观察集群状态</p>
</blockquote>
</li>
</ul>
</li>
<li>各节点的基本信息是否正常:<code class="language-plaintext highlighter-rouge">server_info</code>
<ul>
<li>每个 server 的版本是否正确</li>
<li>通过 start time 判断是否发生过重启</li>
</ul>
</li>
<li>各节点的 metrics 信息是否正常:<code class="language-plaintext highlighter-rouge">server_stat</code>
<ul>
<li>IOPS、读写延迟</li>
<li>内存使用量</li>
</ul>
</li>
<li>各表的 metrics 信息是否正常:<code class="language-plaintext highlighter-rouge">app_stat</code>
<ul>
<li>IOPS</li>
<li>存储用量</li>
</ul>
</li>
</ul>
<p>查看系统信息:
例如,检查服务器的 socket 连接数(其中 <code class="language-plaintext highlighter-rouge">34601</code> 为 MetaServer 的服务监听端口):</p>
<ul>
<li>在 Meta Server 所在服务器上使用 <code class="language-plaintext highlighter-rouge">netstat</code> 命令检查连接数:
<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code> netstat <span class="nt">-na</span> | <span class="nb">grep</span> <span class="s1">'34601\&gt;'</span> | <span class="nb">grep </span>ESTABLISHED | <span class="nb">wc</span> <span class="nt">-l</span>
</code></pre></div> </div>
</li>
<li>检查与该服务器建立连接的远程节点,按照连接数排序:
<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code> netstat <span class="nt">-na</span> | <span class="nb">grep</span> <span class="s1">'34601\&gt;'</span> | <span class="nb">grep </span>ESTABLISHED | <span class="nb">awk</span> <span class="s1">'{print $5}'</span> | <span class="nb">sed</span> <span class="s1">'s/:.*//'</span> | <span class="nb">sort</span> | <span class="nb">uniq</span> <span class="nt">-c</span> | <span class="nb">sort</span> <span class="nt">-k1</span> <span class="nt">-n</span> <span class="nt">-r</span> | <span class="nb">head</span>
</code></pre></div> </div>
</li>
<li>如果连接数太多(例如单节点连接数超过 100),就需要进一步分析原因。</li>
</ul>
<h2 id="常见故障处理方法">常见故障处理方法</h2>
<ul>
<li>如果服务进程异常退出,需要登录到对应服务器上,检查原因:
<ul>
<li>查看 <code class="language-plaintext highlighter-rouge">dmesg</code> 或 <code class="language-plaintext highlighter-rouge">/var/log/messages</code> 确认进程退出原因</li>
<li>如果是 <code class="language-plaintext highlighter-rouge">Out of memory: Killed process xxx</code>:查看 Meta Server 或 Replica Server 的内存使用监控,分析是否有异常现象</li>
<li>如果是 <code class="language-plaintext highlighter-rouge">segfault at xxx</code>
<ul>
<li>查看 Meta Server 或 Replica Server 的标准错误输出日志和服务日志</li>
<li>检查是否有 coredump 文件生成,有则使用 <code class="language-plaintext highlighter-rouge">gdb</code> 分析;如果没有 coredump 文件,则按需设置系统和用户的 <code class="language-plaintext highlighter-rouge">ulimit</code></li>
</ul>
</li>
</ul>
</li>
<li>如果出故障服务器较多,可以考虑使用 shell 工具命令 <code class="language-plaintext highlighter-rouge">set_meta_level</code> 设置为 <code class="language-plaintext highlighter-rouge">freezed</code> 状态,避免服务雪崩</li>
<li>如果进程不断重启(异常退出,又被其他进程监控服务拉起),可以考虑临时停止进程监控服务自动地拉起 Pegasus 进程</li>
<li>如果无法远程登录(如 <code class="language-plaintext highlighter-rouge">ssh</code>)到该服务器,有可能是物理机发生宕机,请联系服务提供方处理</li>
</ul>
<h1 id="pegasus-服务接入审核">Pegasus 服务接入审核</h1>
<p>Pegasus 和多数数据库一样,以 <em>表</em> 为单位管理资源。作为 Pegasus 的管理员,在每个表接入时,需要了解表需要的资源量,以便分配合适的计算和存储资源。结合 Pegasus 的存储原理,优化 key-value 的 schema 设计,也有助于保证服务的稳定性。</p>
<p>可以收集分析以下信息:</p>
<ul>
<li>表名</li>
<li>读峰值(QPS)</li>
<li>读总量(条/天)</li>
<li>写峰值(QPS)</li>
<li>写总量(条/天)</li>
<li>key-value 设计模式(以此判断是否存在数据倾斜问题)</li>
<li>访问模式(判断是否存在热点读写问题)</li>
<li>单条数据平均大小(KB/条)</li>
<li>数据总量预估(GB)</li>
<li>增长预估(例如 6 个月 / 1 年 / 3 年的增长量)</li>
<li>读延迟需求(P99 延迟)</li>
<li>写延迟需求(P99 延迟)</li>
<li>IOPS 特征(例如全天均衡、平滑的波峰与低谷、定时的批量写入等)</li>
</ul>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="https://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="https://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
本页导航
</p>
<ul class="menu-list">
<li><a href="#周期巡检">周期巡检</a></li>
<li><a href="#监控报警">监控报警</a></li>
<li><a href="#问题排查">问题排查</a>
<ul>
<li><a href="#常见故障处理方法">常见故障处理方法</a></li>
</ul>
</li>
<li><a href="#pegasus-服务接入审核">Pegasus 服务接入审核</a></li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
  // Mount the DocSearch search box into the #docsearch placeholder that lives
  // in the desktop-only navbar of this page.
  docsearch({
    container: '#docsearch',
    appId: 'QRN30RBW0S',
    indexName: 'pegasus-apache',
    apiKey: 'd3a3252fa344359766707a106c4ed88f',
    // Debug mode is a development aid for inspecting the search modal;
    // keep it off on the published page.
    debug: false
  });
</script>
</body>
</html>