blob: cc4dddccf7f40719b9aad42237b148684f7b4f21 [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Hotspot Detection</title>
<link rel="stylesheet" href="/assets/css/app.css">
<link rel="shortcut icon" href="/assets/images/favicon.ico">
<link rel="stylesheet" href="/assets/css/utilities.min.css">
<link rel="stylesheet" href="/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Hotspot Detection | Pegasus</title>
<meta name="generator" content="Jekyll v4.3.3" />
<meta property="og:title" content="Hotspot Detection" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="功能简介 Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 hashkey 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。" />
<meta property="og:description" content="功能简介 Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 hashkey 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。" />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2024-04-22T06:39:52+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Hotspot Detection" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2024-04-22T06:39:52+00:00","datePublished":"2024-04-22T06:39:52+00:00","description":"功能简介 Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 hashkey 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。","headline":"Hotspot Detection","mainEntityOfPage":{"@type":"WebPage","@id":"/administration/hotspot-detection"},"url":"/administration/hotspot-detection"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/">
<img src="/assets/images/pegasus-logo-inv.png" style="width: 80%;">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">The Pegasus documentation</p>
<ul class="menu-list">
<li>
<a href="/docs/downloads"
class="">
Downloads
</a>
</li>
</ul>
<p class="menu-label">Building Pegasus</p>
<ul class="menu-list">
<li>
<a href="/docs/build/compile-by-docker"
class="">
Compile by docker (recommended)
</a>
</li>
<li>
<a href="/docs/build/compile-from-source"
class="">
Compile from source
</a>
</li>
</ul>
<p class="menu-label">Client Libs</p>
<ul class="menu-list">
<li>
<a href="/clients/java-client"
class="">
Java Client
</a>
</li>
<li>
<a href="/clients/cpp-client"
class="">
C++ Client
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang Client
</a>
</li>
<li>
<a href="/clients/python-client"
class="">
Python Client
</a>
</li>
<li>
<a href="/clients/node-client"
class="">
NodeJS Client
</a>
</li>
<li>
<a href="/clients/scala-client"
class="">
Scala Client
</a>
</li>
</ul>
<p class="menu-label">Tools</p>
<ul class="menu-list">
<li>
<a href="/docs/tools/shell"
class="">
Pegasus Shell
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
Admin CLI
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
Pegasus data access CLI
</a>
</li>
</ul>
<p class="menu-label">API</p>
<ul class="menu-list">
<li>
<a href="/api/ttl"
class="">
TTL(Time To Live)
</a>
</li>
<li>
<a href="/api/single-atomic"
class="">
Single-Atomic Operations
</a>
</li>
<li>
<a href="/api/redis"
class="">
Redis Adaption
</a>
</li>
<li>
<a href="/api/geo"
class="">
GEO Support
</a>
</li>
<li>
<a href="/api/http"
class="">
HTTP API
</a>
</li>
</ul>
<p class="menu-label">Admin</p>
<ul class="menu-list">
<li>
<a href="/administration/deployment"
class="">
Deployment
</a>
</li>
<li>
<a href="/administration/config"
class="">
Configurations
</a>
</li>
<li>
<a href="/administration/rebalance"
class="">
Rebalance
</a>
</li>
<li>
<a href="/administration/monitoring"
class="">
Monitoring
</a>
</li>
<li>
<a href="/administration/rolling-update"
class="">
Rolling Restart and Upgrade
</a>
</li>
<li>
<a href="/administration/scale-in-out"
class="">
Scale-in and Scale-out
</a>
</li>
<li>
<a href="/administration/resource-management"
class="">
Resource Management
</a>
</li>
<li>
<a href="/administration/cold-backup"
class="">
Cold Backup
</a>
</li>
<li>
<a href="/administration/meta-recovery"
class="">
Metadata Recovery
</a>
</li>
<li>
<a href="/administration/replica-recovery"
class="">
Replica Data Recovery
</a>
</li>
<li>
<a href="/administration/zk-migration"
class="">
Zookeeper Migration
</a>
</li>
<li>
<a href="/administration/table-migration"
class="">
Table Migration
</a>
</li>
<li>
<a href="/administration/table-soft-delete"
class="">
Table Soft-Delete
</a>
</li>
<li>
<a href="/administration/table-env"
class="">
Table Environment Variables
</a>
</li>
<li>
<a href="/administration/remote-commands"
class="">
Remote Command
</a>
</li>
<li>
<a href="/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/administration/duplication"
class="">
Duplication
</a>
</li>
<li>
<a href="/administration/compression"
class="">
Data Compression
</a>
</li>
<li>
<a href="/administration/throttling"
class="">
Throttling
</a>
</li>
<li>
<a href="/administration/experiences"
class="">
Experiences
</a>
</li>
<li>
<a href="/administration/manual-compact"
class="">
Manual Compact
</a>
</li>
<li>
<a href="/administration/usage-scenario"
class="">
Usage Scenario
</a>
</li>
<li>
<a href="/administration/bad-disk"
class="">
Bad Disk Repair
</a>
</li>
<li>
<a href="/administration/whitelist"
class="">
Replica Server Whitelist
</a>
</li>
<li>
<a href="/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/administration/hotspot-detection"
class="is-active">
Hotspot Detection
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png">
</a>
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/administration/hotspot-detection"><strong></strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
The Pegasus documentation
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/downloads"
class="navbar-item ">
Downloads
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Building Pegasus
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/build/compile-by-docker"
class="navbar-item ">
Compile by docker (recommended)
</a>
<a href="/docs/build/compile-from-source"
class="navbar-item ">
Compile from source
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Client Libs
</span>
</a>
<div class="navbar-dropdown">
<a href="/clients/java-client"
class="navbar-item ">
Java Client
</a>
<a href="/clients/cpp-client"
class="navbar-item ">
C++ Client
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang Client
</a>
<a href="/clients/python-client"
class="navbar-item ">
Python Client
</a>
<a href="/clients/node-client"
class="navbar-item ">
NodeJS Client
</a>
<a href="/clients/scala-client"
class="navbar-item ">
Scala Client
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Tools
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/tools/shell"
class="navbar-item ">
Pegasus Shell
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
Admin CLI
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
Pegasus data access CLI
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
API
</span>
</a>
<div class="navbar-dropdown">
<a href="/api/ttl"
class="navbar-item ">
TTL(Time To Live)
</a>
<a href="/api/single-atomic"
class="navbar-item ">
Single-Atomic Operations
</a>
<a href="/api/redis"
class="navbar-item ">
Redis Adaption
</a>
<a href="/api/geo"
class="navbar-item ">
GEO Support
</a>
<a href="/api/http"
class="navbar-item ">
HTTP API
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Admin
</span>
</a>
<div class="navbar-dropdown">
<a href="/administration/deployment"
class="navbar-item ">
Deployment
</a>
<a href="/administration/config"
class="navbar-item ">
Configurations
</a>
<a href="/administration/rebalance"
class="navbar-item ">
Rebalance
</a>
<a href="/administration/monitoring"
class="navbar-item ">
Monitoring
</a>
<a href="/administration/rolling-update"
class="navbar-item ">
Rolling Restart and Upgrade
</a>
<a href="/administration/scale-in-out"
class="navbar-item ">
Scale-in and Scale-out
</a>
<a href="/administration/resource-management"
class="navbar-item ">
Resource Management
</a>
<a href="/administration/cold-backup"
class="navbar-item ">
Cold Backup
</a>
<a href="/administration/meta-recovery"
class="navbar-item ">
Metadata Recovery
</a>
<a href="/administration/replica-recovery"
class="navbar-item ">
Replica Data Recovery
</a>
<a href="/administration/zk-migration"
class="navbar-item ">
Zookeeper Migration
</a>
<a href="/administration/table-migration"
class="navbar-item ">
Table Migration
</a>
<a href="/administration/table-soft-delete"
class="navbar-item ">
Table Soft-Delete
</a>
<a href="/administration/table-env"
class="navbar-item ">
Table Environment Variables
</a>
<a href="/administration/remote-commands"
class="navbar-item ">
Remote Command
</a>
<a href="/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/administration/duplication"
class="navbar-item ">
Duplication
</a>
<a href="/administration/compression"
class="navbar-item ">
Data Compression
</a>
<a href="/administration/throttling"
class="navbar-item ">
Throttling
</a>
<a href="/administration/experiences"
class="navbar-item ">
Experiences
</a>
<a href="/administration/manual-compact"
class="navbar-item ">
Manual Compact
</a>
<a href="/administration/usage-scenario"
class="navbar-item ">
Usage Scenario
</a>
<a href="/administration/bad-disk"
class="navbar-item ">
Bad Disk Repair
</a>
<a href="/administration/whitelist"
class="navbar-item ">
Replica Server Whitelist
</a>
<a href="/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/administration/hotspot-detection"
class="navbar-item is-active">
Hotspot Detection
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Given the limitation of docsearch that couldn't handle multiple input,
I make searchbox only shown in desktop. Fix this issue when docsearch.js v3 released.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/administration/hotspot-detection"><strong></strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">Hotspot Detection</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<h1 id="功能简介">功能简介</h1>
<p>Pegasus 是一个以 hash 分片打散数据的分布式存储系统。通常情况下,流量会均匀地打在集群中的所有节点上。但是在极端情况下,比如 <code class="language-plaintext highlighter-rouge">hashkey</code> 设计不合理、出现了热点事件/热点用户、业务代码逻辑错误等场景下,Pegasus 单机节点往往会负载过高从而影响服务整体的可用性。于是我们设计了一套热点检测方案帮助运维人员能及时发现热点问题并找出热点流量。</p>
<h1 id="热点分片查询">热点分片查询</h1>
<h2 id="设计原理">设计原理</h2>
<p>Collector 周期性从集群拿到的各个分片的读写流量进行数据分析,对每个分片通过纵向的历史数据和横向同期数据对比,计算 <a href="https://en.wikipedia.org/wiki/Standard_score">Z-score</a> 用来描述分片的热点情况。在开启 <code class="language-plaintext highlighter-rouge">enable_hotkey_auto_detect</code> 选项后,Collector 会自动向热点分片发送 <a href="#热点流量查询">热点流量查询</a> 请求,统计当前异常的热点流量。</p>
<h2 id="操作示例">操作示例</h2>
<p>在配置文件中添加以下几个配置项,然后重启 Collector:</p>
<div class="language-shell highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="o">[</span>pegasus.collector]
<span class="c"># 开启热点流量自动检测功能,当热点分片被确认之后,</span>
<span class="c"># Collector 会向对应的分片发送热点流量查询请求</span>
enable_hotkey_auto_detect <span class="o">=</span> <span class="nb">true</span>
<span class="c"># 热点分片阈值(Z-score)为 3。在这里可以理解为算法的灵敏度,</span>
<span class="c"># 超过阈值的会被判定成热点分片。</span>
<span class="c"># 在测试中,我们认为阈值设为 3 为比较合理的选项。</span>
hot_partition_threshold <span class="o">=</span> 3
<span class="c"># 单个分片被判定为热点的累积次数超过这个值就会触发热点流量自动检测。</span>
occurrence_threshold <span class="o">=</span> 100
</code></pre></div></div>
<h2 id="相关监控">相关监控</h2>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>app.stat.hotspots@{app_name}.{hotkey_type}.{partition_count}
</code></pre></div></div>
<p>hotkey_type 分为 <code class="language-plaintext highlighter-rouge">read</code><code class="language-plaintext highlighter-rouge">write</code> 分别代表读/写热点</p>
<h1 id="热点流量查询">热点流量查询</h1>
<h2 id="设计原理-1">设计原理</h2>
<p>在 replica 收到对应分片的热点流量查询请求后,会记录统计一段时间的流量,从而分析出具体的热点流量。如果周期时间内找不到热点流量,收集会自动停止。</p>
<h2 id="操作示例-1">操作示例</h2>
<p><strong>开启热点流量检测</strong></p>
<p>你需要在命令行中添加探测表的 app_id、分片号、热点数据类型、需要探测的节点地址</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 1 -t write -c start -d 10.231.57.104:34802
Detect hotkey rpc is starting, use 'detect_hotkey -a 3 -p 1 -t write -c query -d 10.231.57.104:34802' to get the result later
</code></pre></div></div>
<p><strong>查询热点流量结果</strong></p>
<p>当热点流量检测未结束时,会受到如下提示</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 1 -t write -c query -d 10.231.57.104:34802
Hotkey detect rpc performed failed, in 3.1, error_hint:ERR_BUSY Can't get hotkey now, now state: hotkey_collector_state::COARSE_DETECTING
</code></pre></div></div>
<p>成功获取到热点流量 <code class="language-plaintext highlighter-rouge">hashkey = Thisishotkey1</code>,后的结果</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 2 -t write -c query -d 10.231.57.104:34802
Find write hotkey in 3.2 result:\"Thisishotkey1\"
</code></pre></div></div>
<p>周期内无法检测到热点流量的结果</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 2 -t write -c query -d 10.231.57.104:34803
Hotkey detect rpc performed failed, in 3.2, error_hint:ERR_BUSY Can't get hotkey now, now state: hotkey_collector_state::STOPPED
</code></pre></div></div>
<p><strong>结束热点流量检测</strong></p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&gt;&gt;&gt; detect_hotkey -a 3 -p 2 -t write -c stop -d 10.231.57.104:34803
Detect hotkey rpc is stopped now
</code></pre></div></div>
<p>无论是检测成功还是检测失败都要先 stop 这次探测才能开始下一次探测</p>
<h2 id="相关配置">相关配置</h2>
<div class="language-shell highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="o">[</span>pegasus.server]
<span class="c"># 粗粒度筛查热点流量的阈值,灵敏度负相关</span>
hot_key_variance_threshold <span class="o">=</span> 5
<span class="c"># 细粒度筛查热点流量的阈值,灵敏度负相关</span>
hot_bucket_variance_threshold <span class="o">=</span> 7
<span class="c"># 设置为负数,一般不推荐改动</span>
hotkey_buckets_num <span class="o">=</span> 37
<span class="c"># 一次探测最长时间</span>
max_seconds_to_detect_hotkey <span class="o">=</span> 150
<span class="c"># 单次探测收集时间周期</span>
hotkey_analyse_time_interval_s <span class="o">=</span> 10
</code></pre></div></div>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="http://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="http://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
Table of contents
</p>
<ul class="menu-list">
<li><a href="#功能简介">功能简介</a></li>
<li><a href="#热点分片查询">热点分片查询</a>
<ul>
<li><a href="#设计原理">设计原理</a></li>
<li><a href="#操作示例">操作示例</a></li>
<li><a href="#相关监控">相关监控</a></li>
</ul>
</li>
<li><a href="#热点流量查询">热点流量查询</a>
<ul>
<li><a href="#设计原理-1">设计原理</a></li>
<li><a href="#操作示例-1">操作示例</a></li>
<li><a href="#相关配置">相关配置</a></li>
</ul>
</li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
docsearch({
container: '#docsearch',
appId: 'QRN30RBW0S',
indexName: 'pegasus-apache',
apiKey: 'd3a3252fa344359766707a106c4ed88f',
debug: true
});
</script>
</body>
</html>