blob: dc10d597e561def4bb299061e03d64874acce4d1 [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Hotspot Detection</title>
<link rel="stylesheet" href="/zh/assets/css/app.css">
<link rel="shortcut icon" href="/zh/assets/images/favicon.ico">
<link rel="stylesheet" href="/zh/assets/css/utilities.min.css">
<link rel="stylesheet" href="/zh/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Hotspot Detection | Pegasus</title>
<meta name="generator" content="Jekyll v4.3.3" />
<meta property="og:title" content="Hotspot Detection" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="功能简介 Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 hashkey 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。" />
<meta property="og:description" content="功能简介 Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 hashkey 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。" />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2024-04-22T13:02:52+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Hotspot Detection" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2024-04-22T13:02:52+00:00","datePublished":"2024-04-22T13:02:52+00:00","description":"功能简介 Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 hashkey 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。","headline":"Hotspot Detection","mainEntityOfPage":{"@type":"WebPage","@id":"/administration/hotspot-detection"},"url":"/administration/hotspot-detection"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/zh/">
<img src="/assets/images/pegasus-logo-inv.png" style="width: 80%;">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">Pegasus 产品文档</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/downloads"
class="">
下载
</a>
</li>
</ul>
<p class="menu-label">编译构建</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/build/compile-by-docker"
class="">
使用 Docker 完成编译(推荐)
</a>
</li>
<li>
<a href="/zh/docs/build/compile-from-source"
class="">
从源码编译
</a>
</li>
</ul>
<p class="menu-label">客户端库</p>
<ul class="menu-list">
<li>
<a href="/zh/clients/java-client"
class="">
Java 客户端
</a>
</li>
<li>
<a href="/zh/clients/cpp-client"
class="">
C++ 客户端
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang 客户端
</a>
</li>
<li>
<a href="/zh/clients/python-client"
class="">
Python 客户端
</a>
</li>
<li>
<a href="/zh/clients/node-client"
class="">
NodeJS 客户端
</a>
</li>
<li>
<a href="/zh/clients/scala-client"
class="">
Scala 客户端
</a>
</li>
</ul>
<p class="menu-label">生态工具</p>
<ul class="menu-list">
<li>
<a href="/zh/docs/tools/shell"
class="">
Pegasus Shell 工具
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
集群管理命令行
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
数据访问命令行
</a>
</li>
</ul>
<p class="menu-label">用户接口</p>
<ul class="menu-list">
<li>
<a href="/zh/api/ttl"
class="">
TTL
</a>
</li>
<li>
<a href="/zh/api/single-atomic"
class="">
单行原子操作
</a>
</li>
<li>
<a href="/zh/api/redis"
class="">
Redis 适配
</a>
</li>
<li>
<a href="/zh/api/geo"
class="">
GEO 支持
</a>
</li>
<li>
<a href="/zh/api/http"
class="">
HTTP 接口
</a>
</li>
</ul>
<p class="menu-label">高效运维</p>
<ul class="menu-list">
<li>
<a href="/zh/administration/deployment"
class="">
集群部署
</a>
</li>
<li>
<a href="/zh/administration/config"
class="">
配置说明
</a>
</li>
<li>
<a href="/zh/administration/rebalance"
class="">
负载均衡
</a>
</li>
<li>
<a href="/zh/administration/monitoring"
class="">
可视化监控
</a>
</li>
<li>
<a href="/zh/administration/rolling-update"
class="">
集群重启和升级
</a>
</li>
<li>
<a href="/zh/administration/scale-in-out"
class="">
集群扩容缩容
</a>
</li>
<li>
<a href="/zh/administration/resource-management"
class="">
资源管理
</a>
</li>
<li>
<a href="/zh/administration/cold-backup"
class="">
冷备份
</a>
</li>
<li>
<a href="/zh/administration/meta-recovery"
class="">
元数据恢复
</a>
</li>
<li>
<a href="/zh/administration/replica-recovery"
class="">
Replica 数据恢复
</a>
</li>
<li>
<a href="/zh/administration/zk-migration"
class="">
Zookeeper 迁移
</a>
</li>
<li>
<a href="/zh/administration/table-migration"
class="">
Table 迁移
</a>
</li>
<li>
<a href="/zh/administration/table-soft-delete"
class="">
Table 软删除
</a>
</li>
<li>
<a href="/zh/administration/table-env"
class="">
Table 环境变量
</a>
</li>
<li>
<a href="/zh/administration/remote-commands"
class="">
远程命令
</a>
</li>
<li>
<a href="/zh/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/zh/administration/duplication"
class="">
跨机房同步
</a>
</li>
<li>
<a href="/zh/administration/compression"
class="">
数据压缩
</a>
</li>
<li>
<a href="/zh/administration/throttling"
class="">
流量控制
</a>
</li>
<li>
<a href="/zh/administration/experiences"
class="">
运维经验
</a>
</li>
<li>
<a href="/zh/administration/manual-compact"
class="">
Manual Compact 功能
</a>
</li>
<li>
<a href="/zh/administration/usage-scenario"
class="">
Usage Scenario 功能
</a>
</li>
<li>
<a href="/zh/administration/bad-disk"
class="">
坏盘检修
</a>
</li>
<li>
<a href="/zh/administration/whitelist"
class="">
Replica Server 白名单
</a>
</li>
<li>
<a href="/zh/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/zh/administration/hotspot-detection"
class="is-active">
热点检测
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/zh/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png">
</a>
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<!--If you don't want a url to be relativized, you can add a space explicitly into the href to
prevents a url from being relativized by polyglot.-->
<a class="button is-light is-outlined is-inverted" href=" /administration/hotspot-detection"><strong>En</strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Pegasus 产品文档
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/downloads"
class="navbar-item ">
下载
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
编译构建
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/build/compile-by-docker"
class="navbar-item ">
使用 Docker 完成编译(推荐)
</a>
<a href="/zh/docs/build/compile-from-source"
class="navbar-item ">
从源码编译
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
客户端库
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/clients/java-client"
class="navbar-item ">
Java 客户端
</a>
<a href="/zh/clients/cpp-client"
class="navbar-item ">
C++ 客户端
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang 客户端
</a>
<a href="/zh/clients/python-client"
class="navbar-item ">
Python 客户端
</a>
<a href="/zh/clients/node-client"
class="navbar-item ">
NodeJS 客户端
</a>
<a href="/zh/clients/scala-client"
class="navbar-item ">
Scala 客户端
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
生态工具
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/docs/tools/shell"
class="navbar-item ">
Pegasus Shell 工具
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
集群管理命令行
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
数据访问命令行
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
用户接口
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/api/ttl"
class="navbar-item ">
TTL
</a>
<a href="/zh/api/single-atomic"
class="navbar-item ">
单行原子操作
</a>
<a href="/zh/api/redis"
class="navbar-item ">
Redis 适配
</a>
<a href="/zh/api/geo"
class="navbar-item ">
GEO 支持
</a>
<a href="/zh/api/http"
class="navbar-item ">
HTTP 接口
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
高效运维
</span>
</a>
<div class="navbar-dropdown">
<a href="/zh/administration/deployment"
class="navbar-item ">
集群部署
</a>
<a href="/zh/administration/config"
class="navbar-item ">
配置说明
</a>
<a href="/zh/administration/rebalance"
class="navbar-item ">
负载均衡
</a>
<a href="/zh/administration/monitoring"
class="navbar-item ">
可视化监控
</a>
<a href="/zh/administration/rolling-update"
class="navbar-item ">
集群重启和升级
</a>
<a href="/zh/administration/scale-in-out"
class="navbar-item ">
集群扩容缩容
</a>
<a href="/zh/administration/resource-management"
class="navbar-item ">
资源管理
</a>
<a href="/zh/administration/cold-backup"
class="navbar-item ">
冷备份
</a>
<a href="/zh/administration/meta-recovery"
class="navbar-item ">
元数据恢复
</a>
<a href="/zh/administration/replica-recovery"
class="navbar-item ">
Replica 数据恢复
</a>
<a href="/zh/administration/zk-migration"
class="navbar-item ">
Zookeeper 迁移
</a>
<a href="/zh/administration/table-migration"
class="navbar-item ">
Table 迁移
</a>
<a href="/zh/administration/table-soft-delete"
class="navbar-item ">
Table 软删除
</a>
<a href="/zh/administration/table-env"
class="navbar-item ">
Table 环境变量
</a>
<a href="/zh/administration/remote-commands"
class="navbar-item ">
远程命令
</a>
<a href="/zh/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/zh/administration/duplication"
class="navbar-item ">
跨机房同步
</a>
<a href="/zh/administration/compression"
class="navbar-item ">
数据压缩
</a>
<a href="/zh/administration/throttling"
class="navbar-item ">
流量控制
</a>
<a href="/zh/administration/experiences"
class="navbar-item ">
运维经验
</a>
<a href="/zh/administration/manual-compact"
class="navbar-item ">
Manual Compact 功能
</a>
<a href="/zh/administration/usage-scenario"
class="navbar-item ">
Usage Scenario 功能
</a>
<a href="/zh/administration/bad-disk"
class="navbar-item ">
坏盘检修
</a>
<a href="/zh/administration/whitelist"
class="navbar-item ">
Replica Server 白名单
</a>
<a href="/zh/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/zh/administration/hotspot-detection"
class="navbar-item is-active">
热点检测
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Given the limitation of docsearch that couldn't handle multiple input,
I make searchbox only shown in desktop. Fix this issue when docsearch.js v3 released.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<!--If you don't want a url to be relativized, you can add a space explicitly into the href to
prevents a url from being relativized by polyglot.-->
<a class="button is-light is-outlined is-inverted" href=" /administration/hotspot-detection"><strong>En</strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">热点检测</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<h1 id="功能简介">功能简介</h1>
<p>Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 <code class="language-plaintext highlighter-rouge">hashkey</code> 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。</p>
<h1 id="热点分片查询">热点分片查询</h1>
<h2 id="设计原理">设计原理</h2>
<p>Collector 周期性从集群拿到的各个分片的读写流量进行数据分析,对每个分片通过纵向的历史数据和横向同期数据对比,计算 <a href="https://en.wikipedia.org/wiki/Standard_score">Z-score</a> 用来描述分片的热点情况。在开启 <code class="language-plaintext highlighter-rouge">enable_hotkey_auto_detect</code> 选项后,Collector 会自动向热点分片发送 <a href="#热点流量查询">热点流量查询</a> 请求,统计当前异常的热点流量。</p>
<h2 id="操作示例">操作示例</h2>
<p>在配置文件中添加以下几个配置项,然后重启 Collector:</p>
<div class="language-shell highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="o">[</span>pegasus.collector]
<span class="c"># 开启热点流量自动检测功能,当热点分片被确认之后,</span>
<span class="c"># Collector 会向对应的分片发送热点流量查询请求</span>
enable_hotkey_auto_detect <span class="o">=</span> <span class="nb">true</span>
<span class="c"># 热点分片阈值(Z-score)为 3。在这里可以理解为算法的灵敏度,</span>
<span class="c"># 超过阈值的会被判定成热点分片。</span>
<span class="c"># 在测试中,我们认为阈值设为 3 为比较合理的选项。</span>
hot_partition_threshold <span class="o">=</span> 3
<span class="c"># 单个分片被判定为热点的累积次数超过这个值就会触发热点流量自动检测。</span>
occurrence_threshold <span class="o">=</span> 100
</code></pre></div></div>
<h2 id="相关监控">相关监控</h2>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>app.stat.hotspots@{app_name}.{hotkey_type}.{partition_count}
</code></pre></div></div>
<p>hotkey_type 分为 <code class="language-plaintext highlighter-rouge">read</code><code class="language-plaintext highlighter-rouge">write</code> 分别代表读/写热点</p>
<h1 id="热点流量查询">热点流量查询</h1>
<h2 id="设计原理-1">设计原理</h2>
<p>在 replica 收到对应分片的热点流量查询请求后,会记录统计一段时间的流量,从而分析出具体的热点流量。如果周期时间内找不到热点流量,收集会自动停止。</p>
<h2 id="操作示例-1">操作示例</h2>
<p><strong>开启热点流量检测</strong></p>
<p>你需要在命令行中添加探测表的 app_id、分片号、热点数据类型、需要探测的节点地址</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 1 -t write -c start -d 10.231.57.104:34802
Detect hotkey rpc is starting, use 'detect_hotkey -a 3 -p 1 -t write -c query -d 10.231.57.104:34802' to get the result later
</code></pre></div></div>
<p><strong>查询热点流量结果</strong></p>
<p>当热点流量检测未结束时,会受到如下提示</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 1 -t write -c query -d 10.231.57.104:34802
Hotkey detect rpc performed failed, in 3.1, error_hint:ERR_BUSY Can't get hotkey now, now state: hotkey_collector_state::COARSE_DETECTING
</code></pre></div></div>
<p>成功获取到热点流量 <code class="language-plaintext highlighter-rouge">hashkey = Thisishotkey1</code>,后的结果</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 2 -t write -c query -d 10.231.57.104:34802
Find write hotkey in 3.2 result:\"Thisishotkey1\"
</code></pre></div></div>
<p>周期内无法检测到热点流量的结果</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 2 -t write -c query -d 10.231.57.104:34803
Hotkey detect rpc performed failed, in 3.2, error_hint:ERR_BUSY Can't get hotkey now, now state: hotkey_collector_state::STOPPED
</code></pre></div></div>
<p><strong>结束热点流量检测</strong></p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 2 -t write -c stop -d 10.231.57.104:34803
Detect hotkey rpc is stopped now
</code></pre></div></div>
<p>无论是检测成功还是检测失败都要先 stop 这次探测才能开始下一次探测</p>
<h2 id="相关配置">相关配置</h2>
<div class="language-shell highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="o">[</span>pegasus.server]
<span class="c"># 粗粒度筛查热点流量的阈值,灵敏度负相关</span>
hot_key_variance_threshold <span class="o">=</span> 5
<span class="c"># 细粒度筛查热点流量的阈值,灵敏度负相关</span>
hot_bucket_variance_threshold <span class="o">=</span> 7
<span class="c"># 设置为负数,一般不推荐改动</span>
hotkey_buckets_num <span class="o">=</span> 37
<span class="c"># 一次探测最长时间</span>
max_seconds_to_detect_hotkey <span class="o">=</span> 150
<span class="c"># 单次探测收集时间周期</span>
hotkey_analyse_time_interval_s <span class="o">=</span> 10
</code></pre></div></div>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="http://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="http://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
本页导航
</p>
<ul class="menu-list">
<li><a href="#功能简介">功能简介</a></li>
<li><a href="#热点分片查询">热点分片查询</a>
<ul>
<li><a href="#设计原理">设计原理</a></li>
<li><a href="#操作示例">操作示例</a></li>
<li><a href="#相关监控">相关监控</a></li>
</ul>
</li>
<li><a href="#热点流量查询">热点流量查询</a>
<ul>
<li><a href="#设计原理-1">设计原理</a></li>
<li><a href="#操作示例-1">操作示例</a></li>
<li><a href="#相关配置">相关配置</a></li>
</ul>
</li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
docsearch({
container: '#docsearch',
appId: 'QRN30RBW0S',
indexName: 'pegasus-apache',
apiKey: 'd3a3252fa344359766707a106c4ed88f',
debug: true
});
</script>
</body>
</html>