blob: 8d7288706cfcc078067ecdda09fc99ff5a136904 [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Pegasus | Experiences</title>
<link rel="stylesheet" href="/assets/css/app.css">
<link rel="shortcut icon" href="/assets/images/favicon.ico">
<link rel="stylesheet" href="/assets/css/utilities.min.css">
<link rel="stylesheet" href="/assets/css/docsearch.v3.css">
<script src="/assets/js/jquery.min.js"></script>
<script src="/assets/js/all.min.js"></script>
<script src="/assets/js/docsearch.v3.js"></script>
<!-- Begin Jekyll SEO tag v2.8.0 -->
<title>Experiences | Pegasus</title>
<meta name="generator" content="Jekyll v4.3.3" />
<meta property="og:title" content="Experiences" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="The administrator work of a distributed system includes periodic inspections, monitoring &amp; alarms, troubleshooting, access auditing, etc. to help ensure the stability of service." />
<meta property="og:description" content="The administrator work of a distributed system includes periodic inspections, monitoring &amp; alarms, troubleshooting, access auditing, etc. to help ensure the stability of service." />
<meta property="og:site_name" content="Pegasus" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2024-04-22T13:02:52+00:00" />
<meta name="twitter:card" content="summary" />
<meta property="twitter:title" content="Experiences" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"BlogPosting","dateModified":"2024-04-22T13:02:52+00:00","datePublished":"2024-04-22T13:02:52+00:00","description":"The administrator work of a distributed system includes periodic inspections, monitoring &amp; alarms, troubleshooting, access auditing, etc. to help ensure the stability of service.","headline":"Experiences","mainEntityOfPage":{"@type":"WebPage","@id":"/administration/experiences"},"url":"/administration/experiences"}</script>
<!-- End Jekyll SEO tag -->
</head>
<body>
<div class="dashboard is-full-height">
<!-- left panel -->
<div class="dashboard-panel is-medium is-hidden-mobile pl-0">
<div class="dashboard-panel-header has-text-centered">
<a href="/">
<img src="/assets/images/pegasus-logo-inv.png" alt="Pegasus" style="width: 80%;">
</a>
</div>
<div class="dashboard-panel-main is-scrollable pl-6">
<aside class="menu">
<p class="menu-label">The Pegasus documentation</p>
<ul class="menu-list">
<li>
<a href="/docs/downloads"
class="">
Downloads
</a>
</li>
</ul>
<p class="menu-label">Building Pegasus</p>
<ul class="menu-list">
<li>
<a href="/docs/build/compile-by-docker"
class="">
Compile by docker (recommended)
</a>
</li>
<li>
<a href="/docs/build/compile-from-source"
class="">
Compile from source
</a>
</li>
</ul>
<p class="menu-label">Client Libs</p>
<ul class="menu-list">
<li>
<a href="/clients/java-client"
class="">
Java Client
</a>
</li>
<li>
<a href="/clients/cpp-client"
class="">
C++ Client
</a>
</li>
<li>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="">
Golang Client
</a>
</li>
<li>
<a href="/clients/python-client"
class="">
Python Client
</a>
</li>
<li>
<a href="/clients/node-client"
class="">
NodeJS Client
</a>
</li>
<li>
<a href="/clients/scala-client"
class="">
Scala Client
</a>
</li>
</ul>
<p class="menu-label">Tools</p>
<ul class="menu-list">
<li>
<a href="/docs/tools/shell"
class="">
Pegasus Shell
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/admin-cli"
class="">
Admin CLI
</a>
</li>
<li>
<a href="https://github.com/pegasus-kv/pegic"
class="">
Pegasus data access CLI
</a>
</li>
</ul>
<p class="menu-label">API</p>
<ul class="menu-list">
<li>
<a href="/api/ttl"
class="">
TTL(Time To Live)
</a>
</li>
<li>
<a href="/api/single-atomic"
class="">
Single-Atomic Operations
</a>
</li>
<li>
<a href="/api/redis"
class="">
Redis Adaption
</a>
</li>
<li>
<a href="/api/geo"
class="">
GEO Support
</a>
</li>
<li>
<a href="/api/http"
class="">
HTTP API
</a>
</li>
</ul>
<p class="menu-label">Admin</p>
<ul class="menu-list">
<li>
<a href="/administration/deployment"
class="">
Deployment
</a>
</li>
<li>
<a href="/administration/config"
class="">
Configurations
</a>
</li>
<li>
<a href="/administration/rebalance"
class="">
Rebalance
</a>
</li>
<li>
<a href="/administration/monitoring"
class="">
Monitoring
</a>
</li>
<li>
<a href="/administration/rolling-update"
class="">
Rolling Restart and Upgrade
</a>
</li>
<li>
<a href="/administration/scale-in-out"
class="">
Scale-in and Scale-out
</a>
</li>
<li>
<a href="/administration/resource-management"
class="">
Resource Management
</a>
</li>
<li>
<a href="/administration/cold-backup"
class="">
Cold Backup
</a>
</li>
<li>
<a href="/administration/meta-recovery"
class="">
Metadata Recovery
</a>
</li>
<li>
<a href="/administration/replica-recovery"
class="">
Replica Data Recovery
</a>
</li>
<li>
<a href="/administration/zk-migration"
class="">
Zookeeper Migration
</a>
</li>
<li>
<a href="/administration/table-migration"
class="">
Table Migration
</a>
</li>
<li>
<a href="/administration/table-soft-delete"
class="">
Table Soft-Delete
</a>
</li>
<li>
<a href="/administration/table-env"
class="">
Table Environment Variables
</a>
</li>
<li>
<a href="/administration/remote-commands"
class="">
Remote Command
</a>
</li>
<li>
<a href="/administration/partition-split"
class="">
Partition-Split
</a>
</li>
<li>
<a href="/administration/duplication"
class="">
Duplication
</a>
</li>
<li>
<a href="/administration/compression"
class="">
Data Compression
</a>
</li>
<li>
<a href="/administration/throttling"
class="">
Throttling
</a>
</li>
<li>
<a href="/administration/experiences"
class="is-active">
Experiences
</a>
</li>
<li>
<a href="/administration/manual-compact"
class="">
Manual Compact
</a>
</li>
<li>
<a href="/administration/usage-scenario"
class="">
Usage Scenario
</a>
</li>
<li>
<a href="/administration/bad-disk"
class="">
Bad Disk Repair
</a>
</li>
<li>
<a href="/administration/whitelist"
class="">
Replica Server Whitelist
</a>
</li>
<li>
<a href="/administration/backup-request"
class="">
Backup Request
</a>
</li>
<li>
<a href="/administration/hotspot-detection"
class="">
Hotspot Detection
</a>
</li>
</ul>
</aside>
</div>
</div>
<!-- main section -->
<div class="dashboard-main is-scrollable">
<nav class="navbar is-hidden-desktop">
<div class="navbar-brand">
<a href="/" class="navbar-item">
<!-- Pegasus Icon -->
<img src="/assets/images/pegasus-square.png" alt="Pegasus">
</a>
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/administration/experiences" lang="zh"><strong>中文</strong></a>
</div>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navMenu">
<!-- Appears in mobile mode only -->
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navMenu">
<div class="navbar-end">
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
The Pegasus documentation
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/downloads"
class="navbar-item ">
Downloads
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Building Pegasus
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/build/compile-by-docker"
class="navbar-item ">
Compile by docker (recommended)
</a>
<a href="/docs/build/compile-from-source"
class="navbar-item ">
Compile from source
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Client Libs
</span>
</a>
<div class="navbar-dropdown">
<a href="/clients/java-client"
class="navbar-item ">
Java Client
</a>
<a href="/clients/cpp-client"
class="navbar-item ">
C++ Client
</a>
<a href="https://github.com/apache/incubator-pegasus/tree/master/go-client"
class="navbar-item ">
Golang Client
</a>
<a href="/clients/python-client"
class="navbar-item ">
Python Client
</a>
<a href="/clients/node-client"
class="navbar-item ">
NodeJS Client
</a>
<a href="/clients/scala-client"
class="navbar-item ">
Scala Client
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Tools
</span>
</a>
<div class="navbar-dropdown">
<a href="/docs/tools/shell"
class="navbar-item ">
Pegasus Shell
</a>
<a href="https://github.com/pegasus-kv/admin-cli"
class="navbar-item ">
Admin CLI
</a>
<a href="https://github.com/pegasus-kv/pegic"
class="navbar-item ">
Pegasus data access CLI
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
API
</span>
</a>
<div class="navbar-dropdown">
<a href="/api/ttl"
class="navbar-item ">
TTL(Time To Live)
</a>
<a href="/api/single-atomic"
class="navbar-item ">
Single-Atomic Operations
</a>
<a href="/api/redis"
class="navbar-item ">
Redis Adaption
</a>
<a href="/api/geo"
class="navbar-item ">
GEO Support
</a>
<a href="/api/http"
class="navbar-item ">
HTTP API
</a>
</div>
</div>
<!--dropdown-->
<div class="navbar-item has-dropdown is-hoverable">
<a href=""
class="navbar-link ">
<span>
Admin
</span>
</a>
<div class="navbar-dropdown">
<a href="/administration/deployment"
class="navbar-item ">
Deployment
</a>
<a href="/administration/config"
class="navbar-item ">
Configurations
</a>
<a href="/administration/rebalance"
class="navbar-item ">
Rebalance
</a>
<a href="/administration/monitoring"
class="navbar-item ">
Monitoring
</a>
<a href="/administration/rolling-update"
class="navbar-item ">
Rolling Restart and Upgrade
</a>
<a href="/administration/scale-in-out"
class="navbar-item ">
Scale-in and Scale-out
</a>
<a href="/administration/resource-management"
class="navbar-item ">
Resource Management
</a>
<a href="/administration/cold-backup"
class="navbar-item ">
Cold Backup
</a>
<a href="/administration/meta-recovery"
class="navbar-item ">
Metadata Recovery
</a>
<a href="/administration/replica-recovery"
class="navbar-item ">
Replica Data Recovery
</a>
<a href="/administration/zk-migration"
class="navbar-item ">
Zookeeper Migration
</a>
<a href="/administration/table-migration"
class="navbar-item ">
Table Migration
</a>
<a href="/administration/table-soft-delete"
class="navbar-item ">
Table Soft-Delete
</a>
<a href="/administration/table-env"
class="navbar-item ">
Table Environment Variables
</a>
<a href="/administration/remote-commands"
class="navbar-item ">
Remote Command
</a>
<a href="/administration/partition-split"
class="navbar-item ">
Partition-Split
</a>
<a href="/administration/duplication"
class="navbar-item ">
Duplication
</a>
<a href="/administration/compression"
class="navbar-item ">
Data Compression
</a>
<a href="/administration/throttling"
class="navbar-item ">
Throttling
</a>
<a href="/administration/experiences"
class="navbar-item is-active">
Experiences
</a>
<a href="/administration/manual-compact"
class="navbar-item ">
Manual Compact
</a>
<a href="/administration/usage-scenario"
class="navbar-item ">
Usage Scenario
</a>
<a href="/administration/bad-disk"
class="navbar-item ">
Bad Disk Repair
</a>
<a href="/administration/whitelist"
class="navbar-item ">
Replica Server Whitelist
</a>
<a href="/administration/backup-request"
class="navbar-item ">
Backup Request
</a>
<a href="/administration/hotspot-detection"
class="navbar-item ">
Hotspot Detection
</a>
</div>
</div>
</div>
</div>
</nav>
<nav class="navbar is-hidden-mobile">
<div class="navbar-start w-full">
<div class="navbar-item pl-0 w-full">
<!--TODO(wutao): Given the limitation of docsearch that couldn't handle multiple input,
I make searchbox only shown in desktop. Fix this issue when docsearch.js v3 released.
Related issue: https://github.com/algolia/docsearch/issues/230-->
<div id="docsearch"></div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<!--A simple language switch button that only supports zh and en.-->
<!--IF its language is zh, then switches to en.-->
<a class="button is-light is-outlined is-inverted" href="/zh/administration/experiences" lang="zh"><strong>中文</strong></a>
</div>
</div>
</nav>
<section class="hero is-info lg:mr-3">
<div class="hero-body">
<p class="title is-size-2 is-centered">Experiences</p>
</div>
</section>
<section class="section" style="padding-top: 2rem;">
<div class="content">
<p>The administrator work of a distributed system includes periodic inspections, monitoring &amp; alarms, troubleshooting, access auditing, etc. to help ensure the stability of service.</p>
<h1 id="periodic-inspection">Periodic inspection</h1>
<ul>
<li>Availability: Availability remains at 100% normally. Occasionally, in the event of node failures or other anomalies, availability may fall below 100%</li>
<li>IOPS: The sudden increase in IOPS may affect service stability, while the sudden decrease in traffic may be caused by service issues</li>
<li>Read and write latency: The P99 latency spikes of read and/or write operations may affect Pegasus users</li>
<li>System resources usage: Whether CPU, memory, disk usage, network bandwidth and connection count have skyrocketed, and whether they have reached the high water mark</li>
</ul>
<h1 id="monitoring-and-alarms">Monitoring and alarms</h1>
<p>Refer to <a href="/administration/monitoring">Monitoring</a>.</p>
<h1 id="troubleshooting">Troubleshooting</h1>
<p>Use the <a href="/docs/tools/shell">Shell tools</a> to check the status of Pegasus:</p>
<ul>
<li>Whether the basic information of the cluster is normal:
<ul>
<li>Whether the <code class="language-plaintext highlighter-rouge">meta_servers</code> list is normal</li>
<li>Whether the value of <code class="language-plaintext highlighter-rouge">meta_function_level</code> is <code class="language-plaintext highlighter-rouge">steady</code></li>
</ul>
</li>
<li>Whether each table and each partition is healthy: <code class="language-plaintext highlighter-rouge">ls -d</code>
<ul>
<li>Whether the table count is correct</li>
<li>Whether the <code class="language-plaintext highlighter-rouge">unhealthy</code> partition count of every table is 0</li>
</ul>
</li>
<li>Whether each server is healthy: <code class="language-plaintext highlighter-rouge">nodes -d</code>
<ul>
<li>All servers are in the list and their status is <code class="language-plaintext highlighter-rouge">ALIVE</code></li>
<li>Whether the data distribution is severely skewed (i.e. the values in the <code class="language-plaintext highlighter-rouge">replica_count</code> or <code class="language-plaintext highlighter-rouge">primary_count</code> columns of the list are imbalanced). If it is severely skewed, it’s recommended to use the shell tool command <code class="language-plaintext highlighter-rouge">set_meta_level</code> to set the level to <code class="language-plaintext highlighter-rouge">lively</code> during a time window with relatively low traffic so that load balancing is performed. Remember to reset it to the <code class="language-plaintext highlighter-rouge">steady</code> state once the data is balanced.
<blockquote>
<p>Note: For latency sensitive users, load balancing can only be performed when necessary and should not affect service stability. During the process, the cluster status should be closely observed</p>
</blockquote>
</li>
</ul>
</li>
<li>Whether the basic information of each server is normal: <code class="language-plaintext highlighter-rouge">server_info</code>
<ul>
<li>Whether each server version is correct</li>
<li>Determine whether a restart has occurred through each server’s <em>start time</em></li>
</ul>
</li>
<li>Whether the metrics of each server are normal: <code class="language-plaintext highlighter-rouge">server_stat</code>
<ul>
<li>IOPS and latency</li>
<li>Memory usage</li>
</ul>
</li>
<li>Whether the metrics of each table are normal: <code class="language-plaintext highlighter-rouge">app_stat</code>
<ul>
<li>IOPS</li>
<li>Disk usage</li>
</ul>
</li>
</ul>
<p>Check the system information:
For example, check the count of socket connections on the server (where <code class="language-plaintext highlighter-rouge">34601</code> is the service listening port of Meta Server):</p>
<ul>
<li>Use the <code class="language-plaintext highlighter-rouge">netstat</code> command on the server where the Meta Server is deployed to check the count of connections:
<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>netstat <span class="nt">-na</span> | <span class="nb">grep</span> <span class="s1">'34601\&gt;'</span> | <span class="nb">grep </span>ESTABLISHED | <span class="nb">wc</span> <span class="nt">-l</span>
</code></pre></div> </div>
</li>
<li>Check the remote nodes that have established a connection with the server, sorted by the count of connections:
<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>netstat <span class="nt">-na</span> | <span class="nb">grep</span> <span class="s1">'34601\&gt;'</span> | <span class="nb">grep </span>ESTABLISHED | <span class="nb">awk</span> <span class="s1">'{print $5}'</span> | <span class="nb">sed</span> <span class="s1">'s/:.*//'</span> | <span class="nb">sort</span> | <span class="nb">uniq</span> <span class="nt">-c</span> | <span class="nb">sort</span> <span class="nt">-k1</span> <span class="nt">-n</span> <span class="nt">-r</span> | <span class="nb">head</span>
</code></pre></div> </div>
</li>
<li>If there are too many connections (for example, if the number of connections from a single node exceeds 100), further analysis is needed to determine the cause.</li>
</ul>
<h2 id="common-troubleshooting-methods">Common troubleshooting methods</h2>
<ul>
<li>If the service process exits abnormally, it is necessary to log in to the corresponding server to check the reason:
<ul>
<li>Check the abnormal exit reason via <code class="language-plaintext highlighter-rouge">dmesg</code> or <code class="language-plaintext highlighter-rouge">/var/log/messages</code></li>
<li>If it’s <code class="language-plaintext highlighter-rouge">Out of memory: Killed process xxx</code>: Check the memory usage monitoring of Meta Server or Replica Server and analyze for any abnormal issues</li>
<li>If it’s <code class="language-plaintext highlighter-rouge">segfault at xxx</code>:
<ul>
<li>Check the standard error output logs and server logs of Meta Server or Replica Server</li>
<li>Check if there is a coredump file generated, and use <code class="language-plaintext highlighter-rouge">gdb</code> for analysis if there is. If there is no coredump file, set the system and user’s <code class="language-plaintext highlighter-rouge">ulimit</code> as needed.</li>
</ul>
</li>
</ul>
</li>
<li>If there are many faulty servers, consider using the <code class="language-plaintext highlighter-rouge">set_meta_level</code> command to set it to <code class="language-plaintext highlighter-rouge">freezed</code> state to avoid service avalanches</li>
<li>If the process keeps restarting (abnormally exiting and being restarted by another process monitoring service), consider temporarily stopping the process monitoring service so that it no longer automatically restarts the Pegasus process</li>
<li>If remote login (such as <code class="language-plaintext highlighter-rouge">ssh</code>) to the server is not available, it is possible that the physical server has shut down. Please contact the service provider for assistance</li>
</ul>
<h1 id="audit-when-user-apply-pegasus-service">Audit when user apply Pegasus service</h1>
<p>Pegasus, like most databases, manages resources in units of <em>tables</em>. As a Pegasus administrator, when a user applies for a Pegasus table, it is necessary to understand the resources required by the table in order to allocate appropriate computing and storage resources. Considering Pegasus storage principles and optimizing the key-value schema design can also help ensure service stability.</p>
<p>The following information can be collected and analyzed:</p>
<ul>
<li>Table name</li>
<li>Read operation peak (QPS)</li>
<li>Total number of read operations (operations/day)</li>
<li>Write operation peak (TPS)</li>
<li>Total number of write operations (operations/day)</li>
<li>Key-value design schema (to determine if there is a data skew issue)</li>
<li>Read/write mode (to determine if there are read or write hotspot issues)</li>
<li>Average size of each key-value (KB)</li>
<li>Estimated total data usage (GB)</li>
<li>Growth estimate (e.g. 6 months/1 year/3 years of growth)</li>
<li>Read operation latency required (P99 latency)</li>
<li>Write operation latency required (P99 latency)</li>
<li>IOPS characteristic (such as all-day equilibrium, smooth with peaks and valleys, timed batch writes, etc.)</li>
</ul>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content is-small has-text-centered">
<div style="margin-bottom: 20px;">
<a href="http://incubator.apache.org">
<img src="/assets/images/egg-logo.png"
width="15%"
alt="Apache Incubator"/>
</a>
</div>
Copyright &copy; 2023 <a href="http://www.apache.org">The Apache Software Foundation</a>.
Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version
2.0</a>.
<br><br>
Apache Pegasus is an effort undergoing incubation at The Apache Software Foundation (ASF),
sponsored by the Apache Incubator. Incubation is required of all newly accepted projects
until a further review indicates that the infrastructure, communications, and decision making process
have stabilized in a manner consistent with other successful ASF projects. While incubation status is
not necessarily a reflection of the completeness or stability of the code, it does indicate that the
project has yet to be fully endorsed by the ASF.
<br><br>
Apache Pegasus, Pegasus, Apache, the Apache feather logo, and the Apache Pegasus project logo are either
registered trademarks or trademarks of The Apache Software Foundation in the United States and other
countries.
</div>
</div>
</footer>
</div>
<!-- right panel -->
<div class="dashboard-panel is-small is-scrollable is-hidden-mobile">
<p class="menu-label">
<span class="icon">
<i class="fa fa-bars" aria-hidden="true"></i>
</span>
Table of contents
</p>
<ul class="menu-list">
<li><a href="#periodic-inspection">Periodic inspection</a></li>
<li><a href="#monitoring-and-alarms">Monitoring and alarms</a></li>
<li><a href="#troubleshooting">Troubleshooting</a>
<ul>
<li><a href="#common-troubleshooting-methods">Common troubleshooting methods</a></li>
</ul>
</li>
<li><a href="#audit-when-user-apply-pegasus-service">Audit when user apply Pegasus service</a></li>
</ul>
</div>
</div>
<script src="/assets/js/app.js" type="text/javascript"></script>
<script>
// Mount the Algolia DocSearch widget on the #docsearch container rendered in
// the desktop navbar. The appId/indexName/apiKey identify the search index
// used for this site.
// `debug` is a development aid for inspecting the search modal; keep it off
// on the published site so the modal behaves normally for readers.
docsearch({
  container: '#docsearch',
  appId: 'QRN30RBW0S',
  indexName: 'pegasus-apache',
  apiKey: 'd3a3252fa344359766707a106c4ed88f',
  debug: false
});
</script>
</body>
</html>