blob: f859fb7085dc91f8d0b56bc2f29d711abb69b039 [file] [log] [blame]
import{_ as s,r as i,o as r,c as u,b as t,d,a as l,t as o,e as n}from"./app-Bx8hKGcu.js";const c={},h=n('<p>在 IoTDB 的运行过程中,我们希望对 IoTDB 的状态进行观测,以便于排查系统问题或者及时发现系统潜在的风险,能够**反映系统运行状态的一系列指标<br> **就是系统监控指标。</p><h2 id="_1-什么场景下会使用到监控" tabindex="-1"><a class="header-anchor" href="#_1-什么场景下会使用到监控"><span>1. 什么场景下会使用到监控?</span></a></h2><p>那么什么时候会用到监控框架呢?下面列举一些常见的场景。</p><ol><li><p>系统变慢了</p><p>系统变慢几乎是最常见也最头疼的问题,这时候我们需要尽可能多的信息来帮助我们找到系统变慢的原因,比如:</p><ul><li>JVM信息:是不是有FGC?GC耗时多少?GC后内存有没有恢复?是不是有大量的线程?</li><li>系统信息:CPU使用率是不是太高了?磁盘IO是不是很频繁?</li><li>连接数:当前连接是不是太多?</li><li>接口:当前TPS是多少?各个接口耗时有没有变化?</li><li>线程池:系统中各种任务是否有积压?</li><li>缓存命中率</li></ul></li><li><p>磁盘快满了</p><p>这时候我们迫切想知道最近一段时间数据文件的增长情况,看看是不是某种文件有突增。</p></li><li><p>系统运行是否正常</p><p>此时我们可能需要通过错误日志的数量、集群节点的状态等指标来判断系统是否在正常运行。</p></li></ol><h2 id="_2-什么人需要使用监控" tabindex="-1"><a class="header-anchor" href="#_2-什么人需要使用监控"><span>2. 什么人需要使用监控?</span></a></h2><p>所有关注系统状态的人员都可以使用,包括但不限于研发、测试、运维、DBA等等</p><h2 id="_3-什么是监控指标" tabindex="-1"><a class="header-anchor" href="#_3-什么是监控指标"><span>3. 什么是监控指标?</span></a></h2><h3 id="_3-1-监控指标名词解释" tabindex="-1"><a class="header-anchor" href="#_3-1-监控指标名词解释"><span>3.1. 
监控指标名词解释</span></a></h3><p>在 IoTDB 的监控模块,每个监控指标被 <code>Metric Name</code> 和 <code>Tags</code> 唯一标识。</p><ul><li><code>Metric Name</code>:指标类型名称,比如<code>logback_events</code>表示日志事件。</li><li><code>Tags</code>:指标分类,形式为Key-Value对,每个指标下面可以有0到多个分类,常见的Key-Value对: <ul><li><code>name = xxx</code>:被监控对象的名称,是对<strong>业务逻辑</strong>的说明。比如对于<code>Metric Name = entry_seconds_count</code><br> 类型的监控项,name的含义是指被监控的业务接口。</li><li><code>type = xxx</code>:监控指标类型细分,是对<strong>监控指标</strong>本身的说明。比如对于<code>Metric Name = point</code><br> 类型的监控项,type的含义是指监控具体是什么类型的点数。</li><li><code>status = xxx</code>:被监控对象的状态,是对<strong>业务逻辑</strong>的说明。比如对于<code>Metric Name = Task</code>类型的监控项可以通过该参数,从而区分被监控对象的状态。</li><li><code>user = xxx</code>:被监控对象的相关用户,是对<strong>业务逻辑</strong>的说明。比如统计<code>root</code>用户的写入总点数。</li><li>根据具体情况自定义:比如logback_events_total下有一个level的分类,用来表示特定级别下的日志数量。</li></ul></li><li><code>Metric Level</code>:<strong>指标管理级别</strong>,默认启动级别为<code>Core</code>级别,建议启动级别为<code>Important级别</code><br> ,审核严格程度<code>Core &gt; Important &gt; Normal &gt; All</code><ul><li><code>Core</code>:系统的核心指标,供<strong>系统内核和运维人员</strong>使用,关乎系统的<strong>性能、稳定性、安全性</strong>,比如实例的状况,系统的负载等。</li><li><code>Important</code>:模块的重要指标,供<strong>运维和测试人员</strong>使用,直接关乎<strong>每个模块的运行状态</strong>,比如合并文件个数、执行情况等。</li><li><code>Normal</code>:模块的一般指标,供<strong>开发人员</strong>使用,方便在出现问题时<strong>定位模块</strong>,比如合并中的特定关键操作情况。</li><li><code>All</code>:模块的全部指标,供<strong>模块开发人员</strong>使用,往往在复现问题的时候使用,从而快速解决问题。</li></ul></li></ul><h3 id="_3-2-监控指标对外获取数据格式" tabindex="-1"><a class="header-anchor" href="#_3-2-监控指标对外获取数据格式"><span>3.2. 监控指标对外获取数据格式</span></a></h3><ul><li>IoTDB 对外提供 JMX、 Prometheus 和 IoTDB 格式的监控指标: <ul><li>对于 JMX ,可以通过<code>org.apache.iotdb.metrics</code>获取系统监控指标指标。</li><li>对于 Prometheus ,可以通过对外暴露的端口获取监控指标的值</li><li>对于 IoTDB 方式对外暴露:可以通过执行 IoTDB 的查询来获取监控指标</li></ul></li></ul><h2 id="_4-监控指标有哪些" tabindex="-1"><a class="header-anchor" href="#_4-监控指标有哪些"><span>4. 
监控指标有哪些?</span></a></h2>',13),m={href:"https://github.com/apache/iotdb/tree/master/metrics",target:"_blank",rel:"noopener noreferrer"},_=t("h3",{id:"_4-1-core-级别监控指标",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-1-core-级别监控指标"},[t("span",null,"4.1. Core 级别监控指标")])],-1),p=t("p",null,"Core 级别的监控指标在系统运行中默认开启,每一个 Core 级别的监控指标的添加都需要经过谨慎的评估,目前 Core 级别的监控指标如下所述:",-1),g=t("h4",{id:"_4-1-1-集群运行状态",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-1-1-集群运行状态"},[t("span",null,"4.1.1. 集群运行状态")])],-1),b=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),q=t("tr",null,[t("td",null,"config_node"),t("td",null,'name="total",status="Registered/Online/Unknown"'),t("td",null,"AutoGauge"),t("td",null,"已注册/在线/离线 confignode 的节点数量")],-1),f=t("tr",null,[t("td",null,"data_node"),t("td",null,'name="total",status="Registered/Online/Unknown"'),t("td",null,"AutoGauge"),t("td",null,"已注册/在线/离线 datanode 的节点数量")],-1),y=t("td",null,"points",-1),T=t("td",null,"Gauge",-1),D=t("td",null,"最新一个刷盘的memtale的点数",-1),v=n('<h4 id="_4-1-2-iotdb-进程运行状态" tabindex="-1"><a class="header-anchor" href="#_4-1-2-iotdb-进程运行状态"><span>4.1.2. IoTDB 进程运行状态</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>process_cpu_load</td><td>name=&quot;process&quot;</td><td>AutoGauge</td><td>IoTDB 进程的 CPU 占用率,单位为%</td></tr><tr><td>process_cpu_time</td><td>name=&quot;process&quot;</td><td>AutoGauge</td><td>IoTDB 进程占用的 CPU 时间,单位为ns</td></tr><tr><td>process_max_mem</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>IoTDB 进程最大可用内存</td></tr><tr><td>process_total_mem</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>IoTDB 进程当前已申请内存</td></tr><tr><td>process_free_mem</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>IoTDB 进程当前剩余可用内存</td></tr></tbody></table><h4 id="_4-1-3-系统运行状态" tabindex="-1"><a class="header-anchor" href="#_4-1-3-系统运行状态"><span>4.1.3. 
系统运行状态</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>sys_cpu_load</td><td>name=&quot;system&quot;</td><td>AutoGauge</td><td>系统的 CPU 占用率,单位为%</td></tr><tr><td>sys_cpu_cores</td><td>name=&quot;system&quot;</td><td>Gauge</td><td>系统的可用处理器数</td></tr><tr><td>sys_total_physical_memory_size</td><td>name=&quot;memory&quot;</td><td>Gauge</td><td>系统的最大物理内存</td></tr><tr><td>sys_free_physical_memory_size</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>系统的剩余可用内存</td></tr><tr><td>sys_total_swap_space_size</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>系统的交换区最大空间</td></tr><tr><td>sys_free_swap_space_size</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>系统的交换区剩余可用空间</td></tr><tr><td>sys_committed_vm_size</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>系统保证可用于正在运行的进程的虚拟内存量</td></tr><tr><td>sys_disk_total_space</td><td>name=&quot;disk&quot;</td><td>AutoGauge</td><td>系统磁盘总大小</td></tr><tr><td>sys_disk_free_space</td><td>name=&quot;disk&quot;</td><td>AutoGauge</td><td>系统磁盘可用大小</td></tr></tbody></table><h3 id="_4-2-important-级别监控指标" tabindex="-1"><a class="header-anchor" href="#_4-2-important-级别监控指标"><span>4.2. Important 级别监控指标</span></a></h3><p>目前 Important 级别的监控指标如下所述:</p><h4 id="_4-2-1-集群运行状态" tabindex="-1"><a class="header-anchor" href="#_4-2-1-集群运行状态"><span>4.2.1. 集群运行状态</span></a></h4>',7),k=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),C=t("td",null,"cluster_node_leader_count",-1),G=t("td",null,"Gauge",-1),V=t("td",null,"节点上共识组Leader的数量",-1),I=t("td",null,"cluster_node_status",-1),M=t("td",null,"Gauge",-1),A=t("td",null,"节点的状态,0=Unkonwn 1=online",-1),x=t("h4",{id:"_4-2-2-节点统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-2-节点统计"},[t("span",null,"4.2.2. 
节点统计")])],-1),P=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),B=t("tr",null,[t("td",null,"quantity"),t("td",null,'name="database"'),t("td",null,"AutoGauge"),t("td",null,"系统数据库数量")],-1),K=t("tr",null,[t("td",null,"quantity"),t("td",null,'name="timeSeries"'),t("td",null,"AutoGauge"),t("td",null,"系统时间序列数量")],-1),S=t("tr",null,[t("td",null,"quantity"),t("td",null,'name="pointsIn"'),t("td",null,"Counter"),t("td",null,"系统累计写入点数")],-1),N=t("tr",null,[t("td",null,"region"),t("td",null,'name="total",type="SchemaRegion"'),t("td",null,"AutoGauge"),t("td",null,"分区表中 SchemaRegion 总数量")],-1),R=t("tr",null,[t("td",null,"region"),t("td",null,'name="total",type="DataRegion"'),t("td",null,"AutoGauge"),t("td",null,"分区表中 DataRegion 总数量")],-1),j=t("td",null,"region",-1),U=t("td",null,"Gauge",-1),w=t("td",null,"分区表中对应节点上 DataRegion 总数量",-1),O=t("td",null,"region",-1),z=t("td",null,"Gauge",-1),L=t("td",null,"分区表中对应节点上 DataRegion 总数量",-1),H=t("h4",{id:"_4-2-3-iot共识协议统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-3-iot共识协议统计"},[t("span",null,"4.2.3. 
IoT共识协议统计")])],-1),J=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),F=t("td",null,"iot_consensus",-1),E=t("td",null,"AutoGauge",-1),X=t("td",null,"副本组同步线程的当前同步进度",-1),W=t("td",null,"iot_consensus",-1),Q=t("td",null,"AutoGauge",-1),Y=t("td",null,"副本组同步线程缓存队列请求总大小",-1),Z=t("td",null,"iot_consensus",-1),$=t("td",null,"AutoGauge",-1),tt=t("td",null,"副本组主流程写入进度",-1),et=t("td",null,"iot_consensus",-1),ot=t("td",null,"AutoGauge",-1),dt=t("td",null,"副本组同步进度",-1),nt=t("td",null,"stage",-1),at=t("td",null,"Histogram",-1),lt=t("td",null,"主流程获取状态机锁耗时",-1),st=t("td",null,"stage",-1),it=t("td",null,"Histogram",-1),rt=t("td",null,"主流程写入状态机检查耗时",-1),ut=t("td",null,"stage",-1),ct=t("td",null,"Histogram",-1),ht=t("td",null,"主流程写入状态机耗时",-1),mt=t("td",null,"stage",-1),_t=t("td",null,"Histogram",-1),pt=t("td",null,"主流程尝试添加队列耗时",-1),gt=t("td",null,"stage",-1),bt=t("td",null,"Histogram",-1),qt=t("td",null,"主流程全写入耗时",-1),ft=t("td",null,"stage",-1),yt=t("td",null,"Histogram",-1),Tt=t("td",null,"同步线程构造 Batch 耗时",-1),Dt=t("td",null,"stage",-1),vt=t("td",null,"Histogram",-1),kt=t("td",null,"异步回调流程同步日志耗时",-1),Ct=n('<h4 id="_4-2-4-缓存统计" tabindex="-1"><a class="header-anchor" href="#_4-2-4-缓存统计"><span>4.2.4. 
缓存统计</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>cache_hit</td><td>name=&quot;chunk&quot;</td><td>AutoGauge</td><td>ChunkCache的命中率,单位为%</td></tr><tr><td>cache_hit</td><td>name=&quot;schema&quot;</td><td>AutoGauge</td><td>SchemaCache的命中率,单位为%</td></tr><tr><td>cache_hit</td><td>name=&quot;timeSeriesMeta&quot;</td><td>AutoGauge</td><td>TimeseriesMetadataCache的命中率,单位为%</td></tr><tr><td>cache_hit</td><td>name=&quot;bloomFilter&quot;</td><td>AutoGauge</td><td>TimeseriesMetadataCache中的bloomFilter的拦截率,单位为%</td></tr><tr><td>cache</td><td>name=&quot;Database&quot;, type=&quot;hit&quot;</td><td>Counter</td><td>Database Cache 的命中次数</td></tr><tr><td>cache</td><td>name=&quot;Database&quot;, type=&quot;all&quot;</td><td>Counter</td><td>Database Cache 的访问次数</td></tr><tr><td>cache</td><td>name=&quot;SchemaPartition&quot;, type=&quot;hit&quot;</td><td>Counter</td><td>SchemaPartition Cache 的命中次数</td></tr><tr><td>cache</td><td>name=&quot;SchemaPartition&quot;, type=&quot;all&quot;</td><td>Counter</td><td>SchemaPartition Cache 的访问次数</td></tr><tr><td>cache</td><td>name=&quot;DataPartition&quot;, type=&quot;hit&quot;</td><td>Counter</td><td>DataPartition Cache 的命中次数</td></tr><tr><td>cache</td><td>name=&quot;DataPartition&quot;, type=&quot;all&quot;</td><td>Counter</td><td>DataPartition Cache 的访问次数</td></tr></tbody></table><h4 id="_4-2-5-接口层统计" tabindex="-1"><a class="header-anchor" href="#_4-2-5-接口层统计"><span>4.2.5. 
接口层统计</span></a></h4>',3),Gt=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),Vt=t("td",null,"operation",-1),It=t("td",null,"Histogram",-1),Mt=t("td",null,"客户端执行的操作的耗时情况",-1),At=t("td",null,"entry",-1),xt=t("td",null,"Timer",-1),Pt=t("td",null,"Client 建立的 Thrift 的耗时情况",-1),Bt=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="ConfigNodeRPC"'),t("td",null,"AutoGauge"),t("td",null,"ConfigNode 的内部 Thrift 连接数")],-1),Kt=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="Internal"'),t("td",null,"AutoGauge"),t("td",null,"DataNode 的内部 Thrift 连接数")],-1),St=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="MPPDataExchange"'),t("td",null,"AutoGauge"),t("td",null,"MPP 框架的内部 Thrift 连接数")],-1),Nt=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="RPC"'),t("td",null,"AutoGauge"),t("td",null,"Client 建立的 Thrift 连接数")],-1),Rt=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="ConfigNodeRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"ConfigNode 的内部活跃 Thrift 连接数")],-1),jt=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="DataNodeInternalRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"DataNode 的内部活跃 Thrift 连接数")],-1),Ut=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="MPPDataExchangeRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"MPP 框架的内部活跃 Thrift 连接数")],-1),wt=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="ClientRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"Client 建立的活跃 Thrift 连接数")],-1),Ot=t("h4",{id:"_4-2-6-内存统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-6-内存统计"},[t("span",null,"4.2.6. 
内存统计")])],-1),zt=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),Lt=t("td",null,"mem",-1),Ht=t("td",null,"AutoGauge",-1),Jt=t("td",null,"DataNode内对应DataRegion的内存占用,单位为byte",-1),Ft=t("td",null,"mem",-1),Et=t("td",null,"AutoGauge",-1),Xt=t("td",null,"写入TsFile时的ChunkMetaData的内存占用,单位为byte",-1),Wt=t("tr",null,[t("td",null,"mem"),t("td",null,'name="IoTConsensus"'),t("td",null,"AutoGauge"),t("td",null,"IoT共识协议的内存占用,单位为byte")],-1),Qt=t("tr",null,[t("td",null,"mem"),t("td",null,'name="schema_region_total_usage"'),t("td",null,"AutoGauge"),t("td",null,"所有SchemaRegion的总内存占用,单位为byte")],-1),Yt=t("tr",null,[t("td",null,"mem"),t("td",null,'name="schema_region_total_remaining"'),t("td",null,"AutoGauge"),t("td",null,"所有SchemaRegion的总内存剩余,单位为byte")],-1),Zt=n('<h4 id="_4-2-7-任务统计" tabindex="-1"><a class="header-anchor" href="#_4-2-7-任务统计"><span>4.2.7. 任务统计</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>queue</td><td>name=&quot;compaction_inner&quot;, status=&quot;running/waiting&quot;</td><td>Gauge</td><td>空间内合并任务数</td></tr><tr><td>queue</td><td>name=&quot;compaction_cross&quot;, status=&quot;running/waiting&quot;</td><td>Gauge</td><td>跨空间合并任务数</td></tr><tr><td>cost_task</td><td>name=&quot;inner_compaction/cross_compaction/flush&quot;</td><td>Gauge</td><td>任务耗时情况</td></tr><tr><td>queue</td><td>name=&quot;flush&quot;,status=&quot;running/waiting&quot;</td><td>AutoGauge</td><td>刷盘任务数</td></tr><tr><td>queue</td><td>name=&quot;Sub_RawQuery&quot;,status=&quot;running/waiting&quot;</td><td>AutoGauge</td><td>Sub_RawQuery任务数</td></tr></tbody></table><h4 id="_4-2-8-合并统计" tabindex="-1"><a class="header-anchor" href="#_4-2-8-合并统计"><span>4.2.8. 
合并统计</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>data_written</td><td>name=&quot;compaction&quot;, type=&quot;aligned/not-aligned/total&quot;</td><td>Counter</td><td>合并时写入量</td></tr><tr><td>data_read</td><td>name=&quot;compaction&quot;</td><td>Counter</td><td>合并时的读取量</td></tr><tr><td>compaction_task_count</td><td>name = &quot;inner_compaction&quot;, type=&quot;sequence&quot;</td><td>Counter</td><td>顺序空间内合并次数</td></tr><tr><td>compaction_task_count</td><td>name = &quot;inner_compaction&quot;, type=&quot;unsequence&quot;</td><td>Counter</td><td>乱序空间内合并次数</td></tr><tr><td>compaction_task_count</td><td>name = &quot;cross_compaction&quot;, type=&quot;cross&quot;</td><td>Counter</td><td>跨空间合并次数</td></tr></tbody></table><h4 id="_4-2-9-文件统计信息" tabindex="-1"><a class="header-anchor" href="#_4-2-9-文件统计信息"><span>4.2.9. 文件统计信息</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>file_size</td><td>name=&quot;wal&quot;</td><td>AutoGauge</td><td>写前日志总大小,单位为byte</td></tr><tr><td>file_size</td><td>name=&quot;seq&quot;</td><td>AutoGauge</td><td>顺序TsFile总大小,单位为byte</td></tr><tr><td>file_size</td><td>name=&quot;unseq&quot;</td><td>AutoGauge</td><td>乱序TsFile总大小,单位为byte</td></tr><tr><td>file_size</td><td>name=&quot;inner-seq-temp&quot;</td><td>AutoGauge</td><td>顺序空间内合并临时文件大小,单位为byte</td></tr><tr><td>file_size</td><td>name=&quot;inner-unseq-temp&quot;</td><td>AutoGauge</td><td>乱序空间内合并临时文件大小,单位为byte</td></tr><tr><td>file_size</td><td>name=&quot;cross-temp&quot;</td><td>AutoGauge</td><td>跨空间合并临时文件大小,单位为byte</td></tr><tr><td>file_size</td><td>name=&quot;mods&quot;</td><td>AutoGauge</td><td>Modification 
文件的大小</td></tr><tr><td>file_count</td><td>name=&quot;wal&quot;</td><td>AutoGauge</td><td>写前日志文件个数</td></tr><tr><td>file_count</td><td>name=&quot;seq&quot;</td><td>AutoGauge</td><td>顺序TsFile文件个数</td></tr><tr><td>file_count</td><td>name=&quot;unseq&quot;</td><td>AutoGauge</td><td>乱序TsFile文件个数</td></tr><tr><td>file_count</td><td>name=&quot;inner-seq-temp&quot;</td><td>AutoGauge</td><td>顺序空间内合并临时文件个数</td></tr><tr><td>file_count</td><td>name=&quot;inner-unseq-temp&quot;</td><td>AutoGauge</td><td>乱序空间内合并临时文件个数</td></tr><tr><td>file_count</td><td>name=&quot;cross-temp&quot;</td><td>AutoGauge</td><td>跨空间合并临时文件个数</td></tr><tr><td>file_count</td><td>name=&quot;open_file_handlers&quot;</td><td>AutoGauge</td><td>IoTDB 进程打开文件数,仅支持Linux和MacOS</td></tr><tr><td>file_count</td><td>name=&quot;mods&quot;</td><td>AutoGauge</td><td>Modification 文件的数目</td></tr></tbody></table><h4 id="_4-2-10-iotdb-进程统计" tabindex="-1"><a class="header-anchor" href="#_4-2-10-iotdb-进程统计"><span>4.2.10. IoTDB 进程统计</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>process_used_mem</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>IoTDB 进程当前使用内存</td></tr><tr><td>process_mem_ratio</td><td>name=&quot;memory&quot;</td><td>AutoGauge</td><td>IoTDB 进程的内存占用比例</td></tr><tr><td>process_threads_count</td><td>name=&quot;process&quot;</td><td>AutoGauge</td><td>IoTDB 进程当前线程数</td></tr><tr><td>process_status</td><td>name=&quot;process&quot;</td><td>AutoGauge</td><td>IoTDB 进程存活状态,1为存活,0为终止</td></tr></tbody></table><h4 id="_4-2-11-iotdb-日志统计" tabindex="-1"><a class="header-anchor" href="#_4-2-11-iotdb-日志统计"><span>4.2.11. 
IoTDB 日志统计</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>logback_events</td><td>level=&quot;trace/debug/info/warn/error&quot;</td><td>Counter</td><td>不同类型的日志个数</td></tr></tbody></table><h4 id="_4-2-12-jvm-线程统计" tabindex="-1"><a class="header-anchor" href="#_4-2-12-jvm-线程统计"><span>4.2.12. JVM 线程统计</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>jvm_threads_live_threads</td><td></td><td>AutoGauge</td><td>当前线程数</td></tr><tr><td>jvm_threads_daemon_threads</td><td></td><td>AutoGauge</td><td>当前 Daemon 线程数</td></tr><tr><td>jvm_threads_peak_threads</td><td></td><td>AutoGauge</td><td>峰值线程数</td></tr><tr><td>jvm_threads_states_threads</td><td>state=&quot;runnable/blocked/waiting/timed-waiting/new/terminated&quot;</td><td>AutoGauge</td><td>当前处于各种状态的线程数</td></tr></tbody></table><h4 id="_4-2-13-jvm-gc-统计" tabindex="-1"><a class="header-anchor" href="#_4-2-13-jvm-gc-统计"><span>4.2.13. 
JVM GC 统计</span></a></h4>',13),$t=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),te=t("tr",null,[t("td",null,"jvm_gc_pause"),t("td",null,'action="end of major GC/end of minor GC",cause="xxxx"'),t("td",null,"Timer"),t("td",null,"不同原因的Young GC/Full GC的次数与耗时")],-1),ee=t("tr",null,[t("td"),t("td"),t("td"),t("td")],-1),oe=t("td",null,"jvm_gc_concurrent_phase_time",-1),de=t("td",null,"Timer",-1),ne=t("td",null,"不同原因的Young GC/Full GC的次数与耗时",-1),ae=t("tr",null,[t("td"),t("td"),t("td"),t("td")],-1),le=t("tr",null,[t("td",null,"jvm_gc_max_data_size_bytes"),t("td"),t("td",null,"AutoGauge"),t("td",null,"老年代内存的历史最大值")],-1),se=t("tr",null,[t("td",null,"jvm_gc_live_data_size_bytes"),t("td"),t("td",null,"AutoGauge"),t("td",null,"老年代内存的使用值")],-1),ie=t("tr",null,[t("td",null,"jvm_gc_memory_promoted_bytes"),t("td"),t("td",null,"Counter"),t("td",null,"老年代内存正向增长累计值")],-1),re=t("tr",null,[t("td",null,"jvm_gc_memory_allocated_bytes"),t("td"),t("td",null,"Counter"),t("td",null,"GC分配内存正向增长累计值")],-1),ue=t("h4",{id:"_4-2-14-jvm-内存统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-14-jvm-内存统计"},[t("span",null,"4.2.14. 
JVM 内存统计")])],-1),ce=t("table",null,[t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])]),t("tbody",null,[t("tr",null,[t("td",null,"jvm_buffer_memory_used_bytes"),t("td",null,'id="direct/mapped"'),t("td",null,"AutoGauge"),t("td",null,"已经使用的缓冲区大小")]),t("tr",null,[t("td",null,"jvm_buffer_total_capacity_bytes"),t("td",null,'id="direct/mapped"'),t("td",null,"AutoGauge"),t("td",null,"最大缓冲区大小")]),t("tr",null,[t("td",null,"jvm_buffer_count_buffers"),t("td",null,'id="direct/mapped"'),t("td",null,"AutoGauge"),t("td",null,"当前缓冲区数量")]),t("tr",null,[t("td",null,"jvm_memory_committed_bytes"),t("td",{area:'heap/nonheap,id="xxx",'}),t("td",null,"AutoGauge"),t("td",null,"当前申请的内存大小")]),t("tr",null,[t("td",null,"jvm_memory_max_bytes"),t("td",{area:'heap/nonheap,id="xxx",'}),t("td",null,"AutoGauge"),t("td",null,"最大内存")]),t("tr",null,[t("td",null,"jvm_memory_used_bytes"),t("td",{area:'heap/nonheap,id="xxx",'}),t("td",null,"AutoGauge"),t("td",null,"已使用内存大小")])])],-1),he=n('<h4 id="_4-2-15-jvm-类加载统计" tabindex="-1"><a class="header-anchor" href="#_4-2-15-jvm-类加载统计"><span>4.2.15. JVM 类加载统计</span></a></h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>jvm_classes_unloaded_classes</td><td></td><td>AutoGauge</td><td>累计卸载的class数量</td></tr><tr><td>jvm_classes_loaded_classes</td><td></td><td>AutoGauge</td><td>累计加载的class数量</td></tr></tbody></table><h4 id="_4-2-16-jvm-编译时间统计" tabindex="-1"><a class="header-anchor" href="#_4-2-16-jvm-编译时间统计"><span>4.2.16. 
JVM 编译时间统计</span></a></h4>',3),me=t("table",null,[t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])]),t("tbody",null,[t("tr",null,[t("td",null,"jvm_compilation_time_ms"),t("td",{compiler:"HotSpot 64-Bit Tiered Compilers,"}),t("td",null,"AutoGauge"),t("td",null,"耗费在编译上的时间")])])],-1),_e=t("h3",{id:"_4-3-normal-级别监控指标",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-3-normal-级别监控指标"},[t("span",null,"4.3. Normal 级别监控指标")])],-1),pe=t("h4",{id:"_4-3-1-集群",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-3-1-集群"},[t("span",null,"4.3.1. 集群")])],-1),ge=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),be=t("td",null,"region",-1),qe=t("td",null,"AutoGauge",-1),fe=t("td",null,"特定节点上不同 Database 的 DataRegion/SchemaRegion 个数",-1),ye=t("td",null,"slot",-1),Te=t("td",null,"AutoGauge",-1),De=t("td",null,"特定节点上不同 Database 的 DataSlot/SchemaSlot 个数",-1),ve=n(`<h3 id="_4-4-all-级别监控指标" tabindex="-1"><a class="header-anchor" href="#_4-4-all-级别监控指标"><span>4.4. All 级别监控指标</span></a></h3><p>目前还没有All级别的监控指标,后续会持续添加。</p><h2 id="_5-怎样获取这些系统监控" tabindex="-1"><a class="header-anchor" href="#_5-怎样获取这些系统监控"><span>5. 怎样获取这些系统监控?</span></a></h2><ul><li>监控模块的相关配置均在<code>conf/iotdb-{datanode/confignode}.properties</code>中,所有配置项支持通过<code>load configuration</code>命令热加载。</li></ul><h3 id="_5-1-使用-jmx-方式" tabindex="-1"><a class="header-anchor" href="#_5-1-使用-jmx-方式"><span>5.1. 使用 JMX 方式</span></a></h3><p>对于使用 JMX 对外暴露的指标,可以通过 Jconsole 来进行查看。在进入 Jconsole 监控页面后,首先会看到 IoTDB<br> 的各类运行情况的概览。在这里,您可以看到堆内存信息、线程信息、类信息以及服务器的 CPU 使用情况。</p><h4 id="_5-1-1-获取监控指标数据" tabindex="-1"><a class="header-anchor" href="#_5-1-1-获取监控指标数据"><span>5.1.1. 
获取监控指标数据</span></a></h4><p>连接到 JMX 后,您可以通过 &quot;MBeans&quot; 标签找到名为 &quot;org.apache.iotdb.metrics&quot; 的 &quot;MBean&quot;,可以在侧边栏中查看所有监控指标的具体值。</p><img style="width:100%;max-width:800px;max-height:600px;margin-left:auto;margin-right:auto;display:block;" alt="metric-jmx" src="https://alioss.timecho.com/docs/img/github/204018765-6fda9391-ebcf-4c80-98c5-26f34bd74df0.png"><h4 id="_5-1-2-获取其他相关数据" tabindex="-1"><a class="header-anchor" href="#_5-1-2-获取其他相关数据"><span>5.1.2. 获取其他相关数据</span></a></h4><p>连接到 JMX 后,您可以通过 &quot;MBeans&quot; 标签找到名为 &quot;org.apache.iotdb.service&quot; 的 &quot;MBean&quot;,如下图所示,了解服务的基本状态</p><p><img style="width:100%;max-width:800px;max-height:600px;margin-left:auto;margin-right:auto;display:block;" src="https://alioss.timecho.com/docs/img/github/149951720-707f1ee8-32ee-4fde-9252-048caebd232e.png"> <br></p><p>为了提高查询性能,IOTDB 对 ChunkMetaData 和 TsFileMetaData 进行了缓存。用户可以使用 MXBean<br> ,展开侧边栏<code>org.apache.iotdb.db.service</code>查看缓存命中率:</p><img style="width:100%;max-width:800px;max-height:600px;margin-left:auto;margin-right:auto;display:block;" src="https://alioss.timecho.com/docs/img/github/112426760-73e3da80-8d73-11eb-9a8f-9232d1f2033b.png"><h3 id="_5-2-使用-prometheus-方式" tabindex="-1"><a class="header-anchor" href="#_5-2-使用-prometheus-方式"><span>5.2. 使用 Prometheus 方式</span></a></h3><h4 id="_5-2-1-监控指标的-prometheus-映射关系" tabindex="-1"><a class="header-anchor" href="#_5-2-1-监控指标的-prometheus-映射关系"><span>5.2.1. 
监控指标的 Prometheus 映射关系</span></a></h4><blockquote><p>对于 Metric Name 为 name, Tags 为 K1=V1, ..., Kn=Vn 的监控指标有如下映射,其中 value 为具体值</p></blockquote><table><thead><tr><th>监控指标类型</th><th>映射关系</th></tr></thead><tbody><tr><td>Counter</td><td>name_total{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value</td></tr><tr><td>AutoGauge、Gauge</td><td>name{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value</td></tr><tr><td>Histogram</td><td>name_max{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value <br> name_sum{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value <br> name_count{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value <br> name{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.0&quot;} value <br> name{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.25&quot;} value <br> name{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.5&quot;} value <br> name{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.75&quot;} value <br> name{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;1.0&quot;} value</td></tr><tr><td>Rate</td><td>name_total{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value <br> name_total{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, rate=&quot;m1&quot;} value <br> name_total{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, rate=&quot;m5&quot;} value <br> name_total{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, rate=&quot;m15&quot;} value <br> name_total{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, rate=&quot;mean&quot;} value</td></tr><tr><td>Timer</td><td>name_seconds_max{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value <br> name_seconds_sum{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value <br> name_seconds_count{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;} value <br> name_seconds{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.0&quot;} value <br> name_seconds{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.25&quot;} value <br> name_seconds{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.5&quot;} value <br> 
name_seconds{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;0.75&quot;} value <br> name_seconds{k1=&quot;V1&quot;, ..., Kn=&quot;Vn&quot;, quantile=&quot;1.0&quot;} value</td></tr></tbody></table><h4 id="_5-2-2-修改配置文件" tabindex="-1"><a class="header-anchor" href="#_5-2-2-修改配置文件"><span>5.2.2. 修改配置文件</span></a></h4><ol><li>以 DataNode 为例,修改 iotdb-datanode.properties 配置文件如下:</li></ol><div class="language-properties line-numbers-mode" data-ext="properties" data-title="properties"><pre class="language-properties"><code><span class="token key attr-name">dn_metric_reporter_list</span><span class="token punctuation">=</span><span class="token value attr-value">PROMETHEUS</span>
<span class="token key attr-name">dn_metric_level</span><span class="token punctuation">=</span><span class="token value attr-value">CORE</span>
<span class="token key attr-name">dn_metric_prometheus_reporter_port</span><span class="token punctuation">=</span><span class="token value attr-value">9091</span>
</code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><ol start="2"><li><p>启动 IoTDB DataNode</p></li><li><p>打开浏览器或者用<code>curl</code> 访问 <code>http://server_ip:9091/metrics</code>, 就能得到如下 metric 数据:</p></li></ol><div class="language-text line-numbers-mode" data-ext="text" data-title="text"><pre class="language-text"><code>...
# HELP file_count
# TYPE file_count gauge
file_count{name=&quot;wal&quot;,} 0.0
file_count{name=&quot;unseq&quot;,} 0.0
file_count{name=&quot;seq&quot;,} 2.0
...
</code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><h4 id="_5-2-3-prometheus-grafana" tabindex="-1"><a class="header-anchor" href="#_5-2-3-prometheus-grafana"><span>5.2.3. Prometheus + Grafana</span></a></h4><p>如上所示,IoTDB 对外暴露出标准的 Prometheus 格式的监控指标数据,可以使用 Prometheus 采集并存储监控指标,使用 Grafana<br> 可视化监控指标。</p><p>IoTDB、Prometheus、Grafana三者的关系如下图所示:</p><figure><img src="https://alioss.timecho.com/docs/img/UserGuide/System-Tools/Metrics/iotdb_prometheus_grafana.png" alt="iotdb_prometheus_grafana" tabindex="0" loading="lazy"><figcaption>iotdb_prometheus_grafana</figcaption></figure><ol><li>IoTDB在运行过程中持续收集监控指标数据。</li><li>Prometheus以固定的间隔(可配置)从IoTDB的HTTP接口拉取监控指标数据。</li><li>Prometheus将拉取到的监控指标数据存储到自己的TSDB中。</li><li>Grafana以固定的间隔(可配置)从Prometheus查询监控指标数据并绘图展示。</li></ol><p>从交互流程可以看出,我们需要做一些额外的工作来部署和配置Prometheus和Grafana。</p><p>比如,你可以对Prometheus进行如下的配置(部分参数可以自行调整)来从IoTDB获取监控数据</p><div class="language-yaml line-numbers-mode" data-ext="yml" data-title="yml"><pre class="language-yaml"><code><span class="token key atrule">job_name</span><span class="token punctuation">:</span> pull<span class="token punctuation">-</span>metrics
<span class="token key atrule">honor_labels</span><span class="token punctuation">:</span> <span class="token boolean important">true</span>
<span class="token key atrule">honor_timestamps</span><span class="token punctuation">:</span> <span class="token boolean important">true</span>
<span class="token key atrule">scrape_interval</span><span class="token punctuation">:</span> 15s
<span class="token key atrule">scrape_timeout</span><span class="token punctuation">:</span> 10s
<span class="token key atrule">metrics_path</span><span class="token punctuation">:</span> /metrics
<span class="token key atrule">scheme</span><span class="token punctuation">:</span> http
<span class="token key atrule">follow_redirects</span><span class="token punctuation">:</span> <span class="token boolean important">true</span>
<span class="token key atrule">static_configs</span><span class="token punctuation">:</span>
<span class="token punctuation">-</span> <span class="token key atrule">targets</span><span class="token punctuation">:</span>
<span class="token punctuation">-</span> localhost<span class="token punctuation">:</span><span class="token number">9091</span>
</code></pre><div class="line-numbers" aria-hidden="true"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>更多细节可以参考下面的文档:</p>`,32),ke={href:"https://prometheus.io/docs/prometheus/latest/getting_started/",target:"_blank",rel:"noopener noreferrer"},Ce={href:"https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config",target:"_blank",rel:"noopener noreferrer"},Ge={href:"https://grafana.com/docs/grafana/latest/getting-started/getting-started/",target:"_blank",rel:"noopener noreferrer"},Ve={href:"https://prometheus.io/docs/visualization/grafana/#grafana-support-for-prometheus",target:"_blank",rel:"noopener noreferrer"},Ie=n('<h4 id="_5-2-4-apache-iotdb-dashboard" tabindex="-1"><a class="header-anchor" href="#_5-2-4-apache-iotdb-dashboard"><span>5.2.4. Apache IoTDB Dashboard</span></a></h4><p>我们提供了Apache IoTDB Dashboard,在Grafana中显示的效果图如下所示:</p><figure><img src="https://alioss.timecho.com/docs/img/UserGuide/System-Tools/Metrics/dashboard.png" alt="Apache IoTDB Dashboard" tabindex="0" loading="lazy"><figcaption>Apache IoTDB Dashboard</figcaption></figure><h5 id="_5-2-4-1-如何获取-apache-iotdb-dashboard" tabindex="-1"><a class="header-anchor" href="#_5-2-4-1-如何获取-apache-iotdb-dashboard"><span>5.2.4.1. 
如何获取 Apache IoTDB Dashboard?</span></a></h5>',4),Me=t("li",null,[d("您可以在 GitHub 上获取到Dashboard的json文件。 "),t("ol",null,[t("li",null,[t("a",{href:"https://github.com/apache/iotdb/tree/rel/1.0/docs/UserGuide/Monitor-Alert/Apache-IoTDB-ConfigNode-Dashboard.json"},"Apache IoTDB ConfigNode Dashboard")]),t("li",null,[t("a",{href:"https://github.com/apache/iotdb/tree/rel/1.0/docs/UserGuide/Monitor-Alert/Apache-IoTDB-DataNode-Dashboard.json"},"Apache IoTDB DataNode Dashboard")])])],-1),Ae={href:"https://grafana.com/grafana/dashboards/",target:"_blank",rel:"noopener noreferrer"},xe=t("code",null,"Apache IoTDB Dashboard",-1),Pe=n('<p>在创建Grafana时,您可以选择Import刚刚下载的json文件,并为Apache IoTDB Dashboard选择对应目标数据源。</p><h5 id="_5-2-4-2-apache-iotdb-confignode-dashboard-说明" tabindex="-1"><a class="header-anchor" href="#_5-2-4-2-apache-iotdb-confignode-dashboard-说明"><span>5.2.4.2. Apache IoTDB ConfigNode Dashboard 说明</span></a></h5><blockquote><p>除特殊说明的监控项以外,以下监控项均保证在Important级别的监控框架中可用。</p></blockquote><ul><li><code>Overview</code>:系统概述 <ul><li><code>Registered Node</code>:注册的ConfigNode/DataNode个数</li><li><code>DataNode</code>:集群DataNode的存活状态,包括Online和Unknown两种。</li><li><code>ConfigNode</code>:集群ConfigNode的存活状态,包括Online和Unknown两种。</li><li><code>The Status Of Node</code>:集群具体节点运行状态,包括Online和Unkown两种。</li></ul></li><li><code>Region</code>:Region概述 <ul><li><code>Region Number</code>:Region个数,包括总个数,DataRegion 个数和 SchemaRegion 个数。</li><li><code>Leadership distribution</code>:集群 Leader 分布情况,指每个节点上对应 Region 的 Leader 的个数。</li><li><code>Total Region in Node</code>:不同 Node 的 Region 总数量。</li><li><code>Region in Node</code>:不同 Node 的 SchemaRegion/DataRegion 数量。</li><li><code>Region in Database</code>(Normal级别):不同 Database 的 Region 数量,包括 SchemaRegion、DataRegion。</li><li><code>Slot in Database</code>(Normal级别):不同 Database 的Slot数量,包括 DataSlot 数量和 SchemaSlot 数量。</li></ul></li><li><code>System</code>:系统 <ul><li><code>CPU Core</code>:系统 CPU 核数情况。</li><li><code>CPU Load</code>:系统 CPU 负载情况、进度 CPU 
负载情况。</li><li><code>CPU Time Per Minute</code>:进程平均每分钟占用系统 CPU 时间,注意:多核会导致该值超过1分钟。</li><li><code>System Memory</code>:系统物理内存大小、系统使用物理内存大小、虚拟机提交的内存大小。</li><li><code>System Swap Size</code>:系统交换区总大小、系统交换区使用大小。</li><li><code>Process Memory</code>:IoTDB 进程最大内存总大小、IoTDB 进程总内存大小、IoTDB 进程使用内存大小。</li><li><code>The Number of GC Per Minute</code>:平均每分钟 GC 次数。</li><li><code>The Time Consumed Of GC Per Minute</code>:平均每分钟 GC 耗时。</li><li><code>The Number Of Java Thread</code>:IoTDB 进程的不同状态的线程数。</li><li><code>Heap Memory</code>:IoTDB 进程的堆内存</li><li><code>Off Heap Memory</code>:IoTDB 进程的堆外内存</li><li><code>Log Number Per Minute</code>:IoTDB 进程平均每分钟日志数</li><li><code>The Time Consumed of Compliation Per Minute</code>:平均每分钟编译耗时</li><li><code>The Number Of Class</code>:JVM 加载和卸载的类数量</li></ul></li></ul><h5 id="_5-2-4-3-apache-iotdb-datanode-dashboard-说明" tabindex="-1"><a class="header-anchor" href="#_5-2-4-3-apache-iotdb-datanode-dashboard-说明"><span>5.2.4.3. Apache IoTDB DataNode Dashboard 说明</span></a></h5><blockquote><p>除特殊说明的监控项以外,以下监控项均保证在Important级别的监控框架中可用。</p></blockquote><ul><li><code>Overview</code>:系统概述 <ul><li><code>The Number Of Entity</code>:实体数量,包含时间序列等</li><li><code>Write Point Per Minute</code>:每分钟系统平均写入点数</li><li><code>Database Used Memory</code>:每个 Database 使用的内存大小</li></ul></li><li><code>Interface</code>:接口 <ul><li><code>The Time Consumed Of Operation(50%)</code>:不同客户端操作耗时的中位数</li><li><code>The Time Consumed Of Operation(75%)</code>:不同客户端操作耗时的上四分位数</li><li><code>The Time Consumed Of Operation(100%)</code>:不同客户端操作耗时的最大值</li><li><code>The QPS of Interface</code>:系统接口每秒钟访问次数</li><li><code>The Time Consumed Of Interface</code>:系统接口的平均耗时</li><li><code>Cache Hit Rate</code>:缓存命中率</li><li><code>Thrift Connection</code>:建立的 Thrift 连接个数</li><li><code>Thrift Active Thread</code>:建立的活跃的 Thrift 连接的个数</li></ul></li><li><code>Engine</code>:引擎 <ul><li><code>Task Number</code>:系统中不同状态的任务个数</li><li><code>The Time Consumed Of Tasking</code>:系统中不同状态的任务的耗时</li><li><code>Compaction Read 
And Write Per Minute</code>:平均每分钟合并读取和写入数据量</li><li><code>Compaction R/W Ratio Per Minute</code>:平均每分钟合并读取和写入数据比</li><li><code>Compaction Number Per Minute</code>:平均每分钟不同类型的合并任务数量</li></ul></li><li><code>IoTConsensus</code>:IoT共识协议 <ul><li><code>IoTConsensus Used Memory</code>:IoT共识层使用的内存大小</li><li><code>IoTConsensus Sync Index</code>:不同的Region的写入Index和同步Index</li><li><code>IoTConsensus Overview</code>:不同节点的同步总差距、总缓存的请求个数</li><li><code>The time consumed of different stages(50%)</code>:不同阶段耗时的中位数</li><li><code>The time consumed of different stages(75%)</code>:不同阶段耗时的上四分位数</li><li><code>The time consumed of different stages(100%)</code>:不同阶段耗时的最大值</li><li><code>IoTConsensus Search Index Rate</code>:不同region的写入Index的增长速度</li><li><code>IoTConsensus Safe Index Rate</code>:不同region的同步Index的增长速度</li><li><code>IoTConsensus LogDispatcher Request Size</code>:不同的LogDispatcherThread缓存的请求个数</li><li><code>Sync Lag</code>:每个region的同步index差距</li><li><code>Min Peer Sync Lag</code>:每个region的写入index和同步最快的LogDispatcherThread的同步index之间的差距</li><li><code>Sync speed diff of Peers</code>:每个region中同步最快的LogDispatcherThread与同步最慢的LogDispatcherThread之间的同步index差距</li></ul></li><li><code>System</code>:系统 <ul><li><code>CPU Core</code>:系统 CPU 核数情况。</li><li><code>CPU Load</code>:系统 CPU 负载情况、进度 CPU 负载情况。</li><li><code>CPU Time Per Minute</code>:进程平均每分钟占用系统 CPU 时间,注意:多核会导致该值超过1分钟。</li><li><code>System Memory</code>:系统物理内存大小、系统使用物理内存大小、虚拟机提交的内存大小。</li><li><code>System Swap Size</code>:系统交换区总大小、系统交换区使用大小。</li><li><code>Process Memory</code>:IoTDB 进程最大内存总大小、IoTDB 进程总内存大小、IoTDB 进程使用内存大小。</li><li><code>The Size Of File</code>:IoTDB系统相关的文件大小,包括wal下的文件总大小、seq下的tsfile文件总大小、unseq下的tsfile文件总大小</li><li><code>The Number Of File</code>:IoTDB系统相关的文件个数,包括wal下的文件个数、seq下的tsfile文件个数、unseq下的tsfile文件个数</li><li><code>The Space Of Disk</code>:当前data目录所挂载的磁盘总大小和剩余大小</li><li><code>The Number of GC Per Minute</code>:平均每分钟 GC 次数。</li><li><code>The Time Consumed Of GC Per Minute</code>:平均每分钟 GC 耗时。</li><li><code>The Number Of 
Java Thread</code>:IoTDB 进程的不同状态的线程数。</li><li><code>Heap Memory</code>:IoTDB 进程的堆内存</li><li><code>Off Heap Memory</code>:IoTDB 进程的堆外内存</li><li><code>Log Number Per Minute</code>:IoTDB 进程平均每分钟日志数</li><li><code>The Time Consumed of Compliation Per Minute</code>:平均每分钟编译耗时</li><li><code>The Number Of Class</code>:JVM 加载和卸载的类数量</li></ul></li></ul><h3 id="_5-3-使用-iotdb-方式" tabindex="-1"><a class="header-anchor" href="#_5-3-使用-iotdb-方式"><span>5.3. 使用 IoTDB 方式</span></a></h3><h4 id="_5-3-1-监控指标的-iotdb-映射关系" tabindex="-1"><a class="header-anchor" href="#_5-3-1-监控指标的-iotdb-映射关系"><span>5.3.1. 监控指标的 IoTDB 映射关系</span></a></h4><blockquote><p>对于 Metric Name 为 name, Tags 为 K1=V1, ..., Kn=Vn 的监控指标有如下映射,以默认写到 root.__system.metric.<code>ip:port</code> 为例</p></blockquote><table><thead><tr><th>监控指标类型</th><th>映射关系</th></tr></thead><tbody><tr><td>Counter</td><td>root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.value</td></tr><tr><td>AutoGauge、Gauge</td><td>root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.value</td></tr><tr><td>Histogram</td><td>root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.count <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.max <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.sum <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p0 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p25 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p50 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p75 <br> 
root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p100</td></tr><tr><td>Rate</td><td>root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.count <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.mean <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.m1 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.m5 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.m15</td></tr><tr><td>Timer</td><td>root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.count <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.max <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.mean <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.sum <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p0 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p25 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p50 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p75 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.p100 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.m1 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.m5 <br> root.__system.metric.<code>ip:port</code>.name.<code>K1=V1</code>...<code>Kn=Vn</code>.m15</td></tr></tbody></table><h4 id="_5-3-2-获取监控指标" tabindex="-1"><a class="header-anchor" href="#_5-3-2-获取监控指标"><span>5.3.2. 
获取监控指标</span></a></h4><p>根据如上的映射关系,可以构成相关的 IoTDB 查询语句获取监控指标</p>',13);function Be(e,Ke){const a=i("ExternalLinkIcon");return r(),u("div",null,[h,t("p",null,[d("目前,IoTDB 对外提供一些主要模块的监控指标,并且随着新功能的开发以及系统优化或者重构,监控指标也会同步添加和更新。如果想自己在 IoTDB 中添加更多系统监控指标埋点,可以参考"),t("a",m,[d("IoTDB Metrics Framework"),l(a)]),d("使用说明。")]),_,p,g,t("table",null,[b,t("tbody",null,[q,f,t("tr",null,[y,t("td",null,'database="'+o(e.database)+'", type="flush"',1),T,D])])]),v,t("table",null,[k,t("tbody",null,[t("tr",null,[C,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'"',1),G,V]),t("tr",null,[I,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'",type="ConfigNode/DataNode"',1),M,A])])]),x,t("table",null,[P,t("tbody",null,[B,K,S,N,R,t("tr",null,[j,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'",type="SchemaRegion"',1),U,w]),t("tr",null,[O,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'",type="DataRegion"',1),z,L])])]),H,t("table",null,[J,t("tbody",null,[t("tr",null,[F,t("td",null,'name="logDispatcher-'+o(e.IP)+":"+o(e.Port)+'", region="'+o(e.region)+'", type="currentSyncIndex"',1),E,X]),t("tr",null,[W,t("td",null,'name="logDispatcher-'+o(e.IP)+":"+o(e.Port)+'", region="'+o(e.region)+'", type="cachedRequestInMemoryQueue"',1),Q,Y]),t("tr",null,[Z,t("td",null,'name="IoTConsensusServerImpl", region="'+o(e.region)+'", type="searchIndex"',1),$,tt]),t("tr",null,[et,t("td",null,'name="IoTConsensusServerImpl", region="'+o(e.region)+'", type="safeIndex"',1),ot,dt]),t("tr",null,[nt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="getStateMachineLock"',1),at,lt]),t("tr",null,[st,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="checkingBeforeWrite"',1),it,rt]),t("tr",null,[ut,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="writeStateMachine"',1),ct,ht]),t("tr",null,[mt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="offerRequestToQueue"',1),_t,pt]),t("tr",null,[gt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", 
type="consensusWrite"',1),bt,qt]),t("tr",null,[ft,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="constructBatch"',1),yt,Tt]),t("tr",null,[Dt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="syncLogTimePerRequest"',1),vt,kt])])]),Ct,t("table",null,[Gt,t("tbody",null,[t("tr",null,[Vt,t("td",null,'name = "'+o(e.name)+'"',1),It,Mt]),t("tr",null,[At,t("td",null,'name="'+o(e.interface)+'"',1),xt,Pt]),Bt,Kt,St,Nt,Rt,jt,Ut,wt])]),Ot,t("table",null,[zt,t("tbody",null,[t("tr",null,[Lt,t("td",null,'name="database_'+o(e.name)+'"',1),Ht,Jt]),t("tr",null,[Ft,t("td",null,'name="chunkMetaData_'+o(e.name)+'"',1),Et,Xt]),Wt,Qt,Yt])]),Zt,t("table",null,[$t,t("tbody",null,[te,ee,t("tr",null,[oe,t("td",null,'action="'+o(e.action)+'",cause="'+o(e.cause)+'"',1),de,ne]),ae,le,se,ie,re])]),ue,ce,he,me,_e,pe,t("table",null,[ge,t("tbody",null,[t("tr",null,[be,t("td",null,'name="'+o(e.DatabaseName)+'",type="SchemaRegion/DataRegion"',1),qe,fe]),t("tr",null,[ye,t("td",null,'name="'+o(e.DatabaseName)+'",type="schemaSlotNumber/dataSlotNumber"',1),Te,De])])]),ve,t("p",null,[t("a",ke,[d("Prometheus安装使用文档"),l(a)])]),t("p",null,[t("a",Ce,[d("Prometheus从HTTP接口拉取metrics数据的配置说明"),l(a)])]),t("p",null,[t("a",Ge,[d("Grafana安装使用文档"),l(a)])]),t("p",null,[t("a",Ve,[d("Grafana从Prometheus查询数据并绘图的文档"),l(a)])]),Ie,t("ol",null,[Me,t("li",null,[d("您可以访问"),t("a",Ae,[d("Grafana Dashboard官网"),l(a)]),d("搜索"),xe,d("并使用")])]),Pe])}const Ne=s(c,[["render",Be],["__file","Metric-Tool.html.vue"]]),Re=JSON.parse('{"path":"/zh/UserGuide/V1.0.x/Monitor-Alert/Metric-Tool.html","title":"","lang":"zh-CN","frontmatter":{"description":"在 IoTDB 的运行过程中,我们希望对 IoTDB 的状态进行观测,以便于排查系统问题或者及时发现系统潜在的风险,能够**反映系统运行状态的一系列指标 **就是系统监控指标。 1. 什么场景下会使用到监控? 
那么什么时候会用到监控框架呢?下面列举一些常见的场景。 系统变慢了 系统变慢几乎是最常见也最头疼的问题,这时候我们需要尽可能多的信息来帮助我们找...","head":[["link",{"rel":"alternate","hreflang":"en-us","href":"https://iotdb.apache.org/UserGuide/V1.0.x/Monitor-Alert/Metric-Tool.html"}],["meta",{"property":"og:url","content":"https://iotdb.apache.org/zh/UserGuide/V1.0.x/Monitor-Alert/Metric-Tool.html"}],["meta",{"property":"og:site_name","content":"IoTDB Website"}],["meta",{"property":"og:description","content":"在 IoTDB 的运行过程中,我们希望对 IoTDB 的状态进行观测,以便于排查系统问题或者及时发现系统潜在的风险,能够**反映系统运行状态的一系列指标 **就是系统监控指标。 1. 什么场景下会使用到监控? 那么什么时候会用到监控框架呢?下面列举一些常见的场景。 系统变慢了 系统变慢几乎是最常见也最头疼的问题,这时候我们需要尽可能多的信息来帮助我们找..."}],["meta",{"property":"og:type","content":"article"}],["meta",{"property":"og:image","content":"https://alioss.timecho.com/docs/img/UserGuide/System-Tools/Metrics/iotdb_prometheus_grafana.png"}],["meta",{"property":"og:locale","content":"zh-CN"}],["meta",{"property":"og:locale:alternate","content":"en-US"}],["meta",{"property":"og:updated_time","content":"2023-07-10T03:11:17.000Z"}],["meta",{"property":"article:modified_time","content":"2023-07-10T03:11:17.000Z"}],["script",{"type":"application/ld+json"},"{\\"@context\\":\\"https://schema.org\\",\\"@type\\":\\"Article\\",\\"headline\\":\\"\\",\\"image\\":[\\"https://alioss.timecho.com/docs/img/UserGuide/System-Tools/Metrics/iotdb_prometheus_grafana.png\\",\\"https://alioss.timecho.com/docs/img/UserGuide/System-Tools/Metrics/dashboard.png\\"],\\"dateModified\\":\\"2023-07-10T03:11:17.000Z\\",\\"author\\":[]}"]]},"headers":[{"level":2,"title":"1. 什么场景下会使用到监控?","slug":"_1-什么场景下会使用到监控","link":"#_1-什么场景下会使用到监控","children":[]},{"level":2,"title":"2. 什么人需要使用监控?","slug":"_2-什么人需要使用监控","link":"#_2-什么人需要使用监控","children":[]},{"level":2,"title":"3. 什么是监控指标?","slug":"_3-什么是监控指标","link":"#_3-什么是监控指标","children":[{"level":3,"title":"3.1. 监控指标名词解释","slug":"_3-1-监控指标名词解释","link":"#_3-1-监控指标名词解释","children":[]},{"level":3,"title":"3.2. 
监控指标对外获取数据格式","slug":"_3-2-监控指标对外获取数据格式","link":"#_3-2-监控指标对外获取数据格式","children":[]}]},{"level":2,"title":"4. 监控指标有哪些?","slug":"_4-监控指标有哪些","link":"#_4-监控指标有哪些","children":[{"level":3,"title":"4.1. Core 级别监控指标","slug":"_4-1-core-级别监控指标","link":"#_4-1-core-级别监控指标","children":[]},{"level":3,"title":"4.2. Important 级别监控指标","slug":"_4-2-important-级别监控指标","link":"#_4-2-important-级别监控指标","children":[]},{"level":3,"title":"4.3. Normal 级别监控指标","slug":"_4-3-normal-级别监控指标","link":"#_4-3-normal-级别监控指标","children":[]},{"level":3,"title":"4.4. All 级别监控指标","slug":"_4-4-all-级别监控指标","link":"#_4-4-all-级别监控指标","children":[]}]},{"level":2,"title":"5. 怎样获取这些系统监控?","slug":"_5-怎样获取这些系统监控","link":"#_5-怎样获取这些系统监控","children":[{"level":3,"title":"5.1. 使用 JMX 方式","slug":"_5-1-使用-jmx-方式","link":"#_5-1-使用-jmx-方式","children":[]},{"level":3,"title":"5.2. 使用 Prometheus 方式","slug":"_5-2-使用-prometheus-方式","link":"#_5-2-使用-prometheus-方式","children":[]},{"level":3,"title":"5.3. 使用 IoTDB 方式","slug":"_5-3-使用-iotdb-方式","link":"#_5-3-使用-iotdb-方式","children":[]}]}],"git":{"createdTime":1688958677000,"updatedTime":1688958677000,"contributors":[{"name":"CritasWang","email":"critas@outlook.com","commits":1}]},"readingTime":{"minutes":20.28,"words":6083},"filePathRelative":"zh/UserGuide/V1.0.x/Monitor-Alert/Metric-Tool.md","localizedDate":"2023年7月10日","autoDesc":true}');export{Ne as comp,Re as data};