| import{_ as i,C as r,O as s,P as u,ah as c,Q as t,U as d,ai as l,S as o,aW as n}from"./framework-62ad666a.js";const h={},m=n('<p>在 IoTDB 的运行过程中,我们希望对 IoTDB 的状态进行观测,以便于排查系统问题或者及时发现系统潜在的风险,能够**反映系统运行状态的一系列指标 **就是系统监控指标。</p><h2 id="_1-什么场景下会使用到监控" tabindex="-1"><a class="header-anchor" href="#_1-什么场景下会使用到监控" aria-hidden="true">#</a> 1. 什么场景下会使用到监控?</h2><p>那么什么时候会用到监控框架呢?下面列举一些常见的场景。</p><ol><li><p>系统变慢了</p><p>系统变慢几乎是最常见也最头疼的问题,这时候我们需要尽可能多的信息来帮助我们找到系统变慢的原因,比如:</p><ul><li>JVM信息:是不是有FGC?GC耗时多少?GC后内存有没有恢复?是不是有大量的线程?</li><li>系统信息:CPU使用率是不是太高了?磁盘IO是不是很频繁?</li><li>连接数:当前连接是不是太多?</li><li>接口:当前TPS是多少?各个接口耗时有没有变化?</li><li>线程池:系统中各种任务是否有积压?</li><li>缓存命中率</li></ul></li><li><p>磁盘快满了</p><p>这时候我们迫切想知道最近一段时间数据文件的增长情况,看看是不是某种文件有突增。</p></li><li><p>系统运行是否正常</p><p>此时我们可能需要通过错误日志的数量、集群节点的状态等指标来判断系统是否在正常运行。</p></li></ol><h2 id="_2-什么人需要使用监控" tabindex="-1"><a class="header-anchor" href="#_2-什么人需要使用监控" aria-hidden="true">#</a> 2. 什么人需要使用监控?</h2><p>所有关注系统状态的人员都可以使用,包括但不限于研发、测试、运维、DBA等等</p><h2 id="_3-什么是监控指标" tabindex="-1"><a class="header-anchor" href="#_3-什么是监控指标" aria-hidden="true">#</a> 3. 什么是监控指标?</h2><h3 id="_3-1-监控指标名词解释" tabindex="-1"><a class="header-anchor" href="#_3-1-监控指标名词解释" aria-hidden="true">#</a> 3.1. 监控指标名词解释</h3><p>在 IoTDB 的监控模块,每个监控指标被 <code>Metric Name</code> 和 <code>Tags</code> 唯一标识。</p><ul><li><code>Metric Name</code>:指标类型名称,比如<code>logback_events</code>表示日志事件。</li><li><code>Tags</code>:指标分类,形式为Key-Value对,每个指标下面可以有0到多个分类,常见的Key-Value对: <ul><li><code>name = xxx</code>:被监控对象的名称,是对<strong>业务逻辑</strong>的说明。比如对于<code>Metric Name = entry_seconds_count</code> 类型的监控项,name的含义是指被监控的业务接口。</li><li><code>type = xxx</code>:监控指标类型细分,是对<strong>监控指标</strong>本身的说明。比如对于<code>Metric Name = point</code> 类型的监控项,type的含义是指监控具体是什么类型的点数。</li><li><code>status = xxx</code>:被监控对象的状态,是对<strong>业务逻辑</strong>的说明。比如对于<code>Metric Name = Task</code>类型的监控项可以通过该参数,从而区分被监控对象的状态。</li><li><code>user = xxx</code>:被监控对象的相关用户,是对<strong>业务逻辑</strong>的说明。比如统计<code>root</code>用户的写入总点数。</li><li>根据具体情况自定义:比如logback_events_total下有一个level的分类,用来表示特定级别下的日志数量。</li></ul></li><li><code>Metric Level</code>:<strong>指标管理级别</strong>,默认启动级别为<code>Core</code>级别,建议启动级别为<code>Important级别</code> ,审核严格程度<code>Core > Important > Normal > All</code><ul><li><code>Core</code>:系统的核心指标,供<strong>系统内核和运维人员</strong>使用,关乎系统的<strong>性能、稳定性、安全性</strong>,比如实例的状况,系统的负载等。</li><li><code>Important</code>:模块的重要指标,供<strong>运维和测试人员</strong>使用,直接关乎<strong>每个模块的运行状态</strong>,比如合并文件个数、执行情况等。</li><li><code>Normal</code>:模块的一般指标,供<strong>开发人员</strong>使用,方便在出现问题时<strong>定位模块</strong>,比如合并中的特定关键操作情况。</li><li><code>All</code>:模块的全部指标,供<strong>模块开发人员</strong>使用,往往在复现问题的时候使用,从而快速解决问题。</li></ul></li></ul><h3 id="_3-2-监控指标对外获取数据格式" tabindex="-1"><a class="header-anchor" href="#_3-2-监控指标对外获取数据格式" aria-hidden="true">#</a> 3.2. 监控指标对外获取数据格式</h3><ul><li>IoTDB 对外提供 JMX、 Prometheus 和 IoTDB 格式的监控指标: <ul><li>对于 JMX ,可以通过<code>org.apache.iotdb.metrics</code>获取系统监控指标指标。</li><li>对于 Prometheus ,可以通过对外暴露的端口获取监控指标的值</li><li>对于 IoTDB 方式对外暴露:可以通过执行 IoTDB 的查询来获取监控指标</li></ul></li></ul><h2 id="_4-监控指标有哪些" tabindex="-1"><a class="header-anchor" href="#_4-监控指标有哪些" aria-hidden="true">#</a> 4. 监控指标有哪些?</h2>',13),_={href:"https://github.com/apache/iotdb/tree/master/metrics",target:"_blank",rel:"noopener noreferrer"},p=t("h3",{id:"_4-1-core-级别监控指标",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-1-core-级别监控指标","aria-hidden":"true"},"#"),d(" 4.1. Core 级别监控指标")],-1),g=t("p",null,"Core 级别的监控指标在系统运行中默认开启,每一个 Core 级别的监控指标的添加都需要经过谨慎的评估,目前 Core 级别的监控指标如下所述:",-1),b=t("h4",{id:"_4-1-1-集群运行状态",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-1-1-集群运行状态","aria-hidden":"true"},"#"),d(" 4.1.1. 集群运行状态")],-1),q=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),f=t("tr",null,[t("td",null,"config_node"),t("td",null,'name="total",status="Registered/Online/Unknown"'),t("td",null,"AutoGauge"),t("td",null,"已注册/在线/离线 confignode 的节点数量")],-1),y=t("tr",null,[t("td",null,"data_node"),t("td",null,'name="total",status="Registered/Online/Unknown"'),t("td",null,"AutoGauge"),t("td",null,"已注册/在线/离线 datanode 的节点数量")],-1),T=t("td",null,"points",-1),D=t("td",null,"Gauge",-1),v=t("td",null,"最新一个刷盘的memtale的点数",-1),C=n('<h4 id="_4-1-2-iotdb-进程运行状态" tabindex="-1"><a class="header-anchor" href="#_4-1-2-iotdb-进程运行状态" aria-hidden="true">#</a> 4.1.2. IoTDB 进程运行状态</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>process_cpu_load</td><td>name="process"</td><td>AutoGauge</td><td>IoTDB 进程的 CPU 占用率,单位为%</td></tr><tr><td>process_cpu_time</td><td>name="process"</td><td>AutoGauge</td><td>IoTDB 进程占用的 CPU 时间,单位为ns</td></tr><tr><td>process_max_mem</td><td>name="memory"</td><td>AutoGauge</td><td>IoTDB 进程最大可用内存</td></tr><tr><td>process_total_mem</td><td>name="memory"</td><td>AutoGauge</td><td>IoTDB 进程当前已申请内存</td></tr><tr><td>process_free_mem</td><td>name="memory"</td><td>AutoGauge</td><td>IoTDB 进程当前剩余可用内存</td></tr></tbody></table><h4 id="_4-1-3-系统运行状态" tabindex="-1"><a class="header-anchor" href="#_4-1-3-系统运行状态" aria-hidden="true">#</a> 4.1.3. 系统运行状态</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>sys_cpu_load</td><td>name="system"</td><td>AutoGauge</td><td>系统的 CPU 占用率,单位为%</td></tr><tr><td>sys_cpu_cores</td><td>name="system"</td><td>Gauge</td><td>系统的可用处理器数</td></tr><tr><td>sys_total_physical_memory_size</td><td>name="memory"</td><td>Gauge</td><td>系统的最大物理内存</td></tr><tr><td>sys_free_physical_memory_size</td><td>name="memory"</td><td>AutoGauge</td><td>系统的剩余可用内存</td></tr><tr><td>sys_total_swap_space_size</td><td>name="memory"</td><td>AutoGauge</td><td>系统的交换区最大空间</td></tr><tr><td>sys_free_swap_space_size</td><td>name="memory"</td><td>AutoGauge</td><td>系统的交换区剩余可用空间</td></tr><tr><td>sys_committed_vm_size</td><td>name="memory"</td><td>AutoGauge</td><td>系统保证可用于正在运行的进程的虚拟内存量</td></tr><tr><td>sys_disk_total_space</td><td>name="disk"</td><td>AutoGauge</td><td>系统磁盘总大小</td></tr><tr><td>sys_disk_free_space</td><td>name="disk"</td><td>AutoGauge</td><td>系统磁盘可用大小</td></tr></tbody></table><h3 id="_4-2-important-级别监控指标" tabindex="-1"><a class="header-anchor" href="#_4-2-important-级别监控指标" aria-hidden="true">#</a> 4.2. Important 级别监控指标</h3><p>目前 Important 级别的监控指标如下所述:</p><h4 id="_4-2-1-集群运行状态" tabindex="-1"><a class="header-anchor" href="#_4-2-1-集群运行状态" aria-hidden="true">#</a> 4.2.1. 集群运行状态</h4>',7),k=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),G=t("td",null,"cluster_node_leader_count",-1),V=t("td",null,"Gauge",-1),I=t("td",null,"节点上共识组Leader的数量",-1),A=t("td",null,"cluster_node_status",-1),M=t("td",null,"Gauge",-1),x=t("td",null,"节点的状态,0=Unkonwn 1=online",-1),P=t("h4",{id:"_4-2-2-节点统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-2-节点统计","aria-hidden":"true"},"#"),d(" 4.2.2. 节点统计")],-1),K=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),S=t("tr",null,[t("td",null,"quantity"),t("td",null,'name="database"'),t("td",null,"AutoGauge"),t("td",null,"系统数据库数量")],-1),B=t("tr",null,[t("td",null,"quantity"),t("td",null,'name="timeSeries"'),t("td",null,"AutoGauge"),t("td",null,"系统时间序列数量")],-1),N=t("tr",null,[t("td",null,"quantity"),t("td",null,'name="pointsIn"'),t("td",null,"Counter"),t("td",null,"系统累计写入点数")],-1),R=t("tr",null,[t("td",null,"region"),t("td",null,'name="total",type="SchemaRegion"'),t("td",null,"AutoGauge"),t("td",null,"分区表中 SchemaRegion 总数量")],-1),w=t("tr",null,[t("td",null,"region"),t("td",null,'name="total",type="DataRegion"'),t("td",null,"AutoGauge"),t("td",null,"分区表中 DataRegion 总数量")],-1),O=t("td",null,"region",-1),j=t("td",null,"Gauge",-1),U=t("td",null,"分区表中对应节点上 DataRegion 总数量",-1),L=t("td",null,"region",-1),H=t("td",null,"Gauge",-1),z=t("td",null,"分区表中对应节点上 DataRegion 总数量",-1),F=t("h4",{id:"_4-2-3-iot共识协议统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-3-iot共识协议统计","aria-hidden":"true"},"#"),d(" 4.2.3. IoT共识协议统计")],-1),J=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),E=t("td",null,"iot_consensus",-1),W=t("td",null,"AutoGauge",-1),X=t("td",null,"副本组同步线程的当前同步进度",-1),Q=t("td",null,"iot_consensus",-1),Y=t("td",null,"AutoGauge",-1),Z=t("td",null,"副本组同步线程缓存队列请求总大小",-1),$=t("td",null,"iot_consensus",-1),tt=t("td",null,"AutoGauge",-1),et=t("td",null,"副本组主流程写入进度",-1),ot=t("td",null,"iot_consensus",-1),dt=t("td",null,"AutoGauge",-1),nt=t("td",null,"副本组同步进度",-1),at=t("td",null,"stage",-1),lt=t("td",null,"Histogram",-1),it=t("td",null,"主流程获取状态机锁耗时",-1),rt=t("td",null,"stage",-1),st=t("td",null,"Histogram",-1),ut=t("td",null,"主流程写入状态机检查耗时",-1),ct=t("td",null,"stage",-1),ht=t("td",null,"Histogram",-1),mt=t("td",null,"主流程写入状态机耗时",-1),_t=t("td",null,"stage",-1),pt=t("td",null,"Histogram",-1),gt=t("td",null,"主流程尝试添加队列耗时",-1),bt=t("td",null,"stage",-1),qt=t("td",null,"Histogram",-1),ft=t("td",null,"主流程全写入耗时",-1),yt=t("td",null,"stage",-1),Tt=t("td",null,"Histogram",-1),Dt=t("td",null,"同步线程构造 Batch 耗时",-1),vt=t("td",null,"stage",-1),Ct=t("td",null,"Histogram",-1),kt=t("td",null,"异步回调流程同步日志耗时",-1),Gt=n('<h4 id="_4-2-4-缓存统计" tabindex="-1"><a class="header-anchor" href="#_4-2-4-缓存统计" aria-hidden="true">#</a> 4.2.4. 缓存统计</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>cache_hit</td><td>name="chunk"</td><td>AutoGauge</td><td>ChunkCache的命中率,单位为%</td></tr><tr><td>cache_hit</td><td>name="schema"</td><td>AutoGauge</td><td>SchemaCache的命中率,单位为%</td></tr><tr><td>cache_hit</td><td>name="timeSeriesMeta"</td><td>AutoGauge</td><td>TimeseriesMetadataCache的命中率,单位为%</td></tr><tr><td>cache_hit</td><td>name="bloomFilter"</td><td>AutoGauge</td><td>TimeseriesMetadataCache中的bloomFilter的拦截率,单位为%</td></tr><tr><td>cache</td><td>name="Database", type="hit"</td><td>Counter</td><td>Database Cache 的命中次数</td></tr><tr><td>cache</td><td>name="Database", type="all"</td><td>Counter</td><td>Database Cache 的访问次数</td></tr><tr><td>cache</td><td>name="SchemaPartition", type="hit"</td><td>Counter</td><td>SchemaPartition Cache 的命中次数</td></tr><tr><td>cache</td><td>name="SchemaPartition", type="all"</td><td>Counter</td><td>SchemaPartition Cache 的访问次数</td></tr><tr><td>cache</td><td>name="DataPartition", type="hit"</td><td>Counter</td><td>DataPartition Cache 的命中次数</td></tr><tr><td>cache</td><td>name="DataPartition", type="all"</td><td>Counter</td><td>DataPartition Cache 的访问次数</td></tr></tbody></table><h4 id="_4-2-5-接口层统计" tabindex="-1"><a class="header-anchor" href="#_4-2-5-接口层统计" aria-hidden="true">#</a> 4.2.5. 接口层统计</h4>',3),Vt=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),It=t("td",null,"operation",-1),At=t("td",null,"Histogram",-1),Mt=t("td",null,"客户端执行的操作的耗时情况",-1),xt=t("td",null,"entry",-1),Pt=t("td",null,"Timer",-1),Kt=t("td",null,"Client 建立的 Thrift 的耗时情况",-1),St=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="ConfigNodeRPC"'),t("td",null,"AutoGauge"),t("td",null,"ConfigNode 的内部 Thrift 连接数")],-1),Bt=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="Internal"'),t("td",null,"AutoGauge"),t("td",null,"DataNode 的内部 Thrift 连接数")],-1),Nt=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="MPPDataExchange"'),t("td",null,"AutoGauge"),t("td",null,"MPP 框架的内部 Thrift 连接数")],-1),Rt=t("tr",null,[t("td",null,"thrift_connections"),t("td",null,'name="RPC"'),t("td",null,"AutoGauge"),t("td",null,"Client 建立的 Thrift 连接数")],-1),wt=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="ConfigNodeRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"ConfigNode 的内部活跃 Thrift 连接数")],-1),Ot=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="DataNodeInternalRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"DataNode 的内部活跃 Thrift 连接数")],-1),jt=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="MPPDataExchangeRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"MPP 框架的内部活跃 Thrift 连接数")],-1),Ut=t("tr",null,[t("td",null,"thrift_active_threads"),t("td",null,'name="ClientRPC-Service"'),t("td",null,"AutoGauge"),t("td",null,"Client 建立的活跃 Thrift 连接数")],-1),Lt=t("h4",{id:"_4-2-6-内存统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-6-内存统计","aria-hidden":"true"},"#"),d(" 4.2.6. 内存统计")],-1),Ht=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),zt=t("td",null,"mem",-1),Ft=t("td",null,"AutoGauge",-1),Jt=t("td",null,"DataNode内对应DataRegion的内存占用,单位为byte",-1),Et=t("td",null,"mem",-1),Wt=t("td",null,"AutoGauge",-1),Xt=t("td",null,"写入TsFile时的ChunkMetaData的内存占用,单位为byte",-1),Qt=t("tr",null,[t("td",null,"mem"),t("td",null,'name="IoTConsensus"'),t("td",null,"AutoGauge"),t("td",null,"IoT共识协议的内存占用,单位为byte")],-1),Yt=t("tr",null,[t("td",null,"mem"),t("td",null,'name="schema_region_total_usage"'),t("td",null,"AutoGauge"),t("td",null,"所有SchemaRegion的总内存占用,单位为byte")],-1),Zt=t("tr",null,[t("td",null,"mem"),t("td",null,'name="schema_region_total_remaining"'),t("td",null,"AutoGauge"),t("td",null,"所有SchemaRegion的总内存剩余,单位为byte")],-1),$t=n('<h4 id="_4-2-7-任务统计" tabindex="-1"><a class="header-anchor" href="#_4-2-7-任务统计" aria-hidden="true">#</a> 4.2.7. 任务统计</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>queue</td><td>name="compaction_inner", status="running/waiting"</td><td>Gauge</td><td>空间内合并任务数</td></tr><tr><td>queue</td><td>name="compaction_cross", status="running/waiting"</td><td>Gauge</td><td>跨空间合并任务数</td></tr><tr><td>cost_task</td><td>name="inner_compaction/cross_compaction/flush"</td><td>Gauge</td><td>任务耗时情况</td></tr><tr><td>queue</td><td>name="flush",status="running/waiting"</td><td>AutoGauge</td><td>刷盘任务数</td></tr><tr><td>queue</td><td>name="Sub_RawQuery",status="running/waiting"</td><td>AutoGauge</td><td>Sub_RawQuery任务数</td></tr></tbody></table><h4 id="_4-2-8-合并统计" tabindex="-1"><a class="header-anchor" href="#_4-2-8-合并统计" aria-hidden="true">#</a> 4.2.8. 合并统计</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>data_written</td><td>name="compaction", type="aligned/not-aligned/total"</td><td>Counter</td><td>合并时写入量</td></tr><tr><td>data_read</td><td>name="compaction"</td><td>Counter</td><td>合并时的读取量</td></tr><tr><td>compaction_task_count</td><td>name = "inner_compaction", type="sequence"</td><td>Counter</td><td>顺序空间内合并次数</td></tr><tr><td>compaction_task_count</td><td>name = "inner_compaction", type="unsequence"</td><td>Counter</td><td>乱序空间内合并次数</td></tr><tr><td>compaction_task_count</td><td>name = "cross_compaction", type="cross"</td><td>Counter</td><td>跨空间合并次数</td></tr></tbody></table><h4 id="_4-2-9-文件统计信息" tabindex="-1"><a class="header-anchor" href="#_4-2-9-文件统计信息" aria-hidden="true">#</a> 4.2.9. 文件统计信息</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>file_size</td><td>name="wal"</td><td>AutoGauge</td><td>写前日志总大小,单位为byte</td></tr><tr><td>file_size</td><td>name="seq"</td><td>AutoGauge</td><td>顺序TsFile总大小,单位为byte</td></tr><tr><td>file_size</td><td>name="unseq"</td><td>AutoGauge</td><td>乱序TsFile总大小,单位为byte</td></tr><tr><td>file_size</td><td>name="inner-seq-temp"</td><td>AutoGauge</td><td>顺序空间内合并临时文件大小,单位为byte</td></tr><tr><td>file_size</td><td>name="inner-unseq-temp"</td><td>AutoGauge</td><td>乱序空间内合并临时文件大小,单位为byte</td></tr><tr><td>file_size</td><td>name="cross-temp"</td><td>AutoGauge</td><td>跨空间合并临时文件大小,单位为byte</td></tr><tr><td>file_size</td><td>name="mods"</td><td>AutoGauge</td><td>Modification 文件的大小</td></tr><tr><td>file_count</td><td>name="wal"</td><td>AutoGauge</td><td>写前日志文件个数</td></tr><tr><td>file_count</td><td>name="seq"</td><td>AutoGauge</td><td>顺序TsFile文件个数</td></tr><tr><td>file_count</td><td>name="unseq"</td><td>AutoGauge</td><td>乱序TsFile文件个数</td></tr><tr><td>file_count</td><td>name="inner-seq-temp"</td><td>AutoGauge</td><td>顺序空间内合并临时文件个数</td></tr><tr><td>file_count</td><td>name="inner-unseq-temp"</td><td>AutoGauge</td><td>乱序空间内合并临时文件个数</td></tr><tr><td>file_count</td><td>name="cross-temp"</td><td>AutoGauge</td><td>跨空间合并临时文件个数</td></tr><tr><td>file_count</td><td>name="open_file_handlers"</td><td>AutoGauge</td><td>IoTDB 进程打开文件数,仅支持Linux和MacOS</td></tr><tr><td>file_count</td><td>name="mods</td><td>AutoGauge</td><td>Modification 文件的数目</td></tr></tbody></table><h4 id="_4-2-10-iotdb-进程统计" tabindex="-1"><a class="header-anchor" href="#_4-2-10-iotdb-进程统计" aria-hidden="true">#</a> 4.2.10. IoTDB 进程统计</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>process_used_mem</td><td>name="memory"</td><td>AutoGauge</td><td>IoTDB 进程当前使用内存</td></tr><tr><td>process_mem_ratio</td><td>name="memory"</td><td>AutoGauge</td><td>IoTDB 进程的内存占用比例</td></tr><tr><td>process_threads_count</td><td>name="process"</td><td>AutoGauge</td><td>IoTDB 进程当前线程数</td></tr><tr><td>process_status</td><td>name="process"</td><td>AutoGauge</td><td>IoTDB 进程存活状态,1为存活,0为终止</td></tr></tbody></table><h4 id="_4-2-11-iotdb-日志统计" tabindex="-1"><a class="header-anchor" href="#_4-2-11-iotdb-日志统计" aria-hidden="true">#</a> 4.2.11. IoTDB 日志统计</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>logback_events</td><td>level="trace/debug/info/warn/error"</td><td>Counter</td><td>不同类型的日志个数</td></tr></tbody></table><h4 id="_4-2-12-jvm-线程统计" tabindex="-1"><a class="header-anchor" href="#_4-2-12-jvm-线程统计" aria-hidden="true">#</a> 4.2.12. JVM 线程统计</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>jvm_threads_live_threads</td><td></td><td>AutoGauge</td><td>当前线程数</td></tr><tr><td>jvm_threads_daemon_threads</td><td></td><td>AutoGauge</td><td>当前 Daemon 线程数</td></tr><tr><td>jvm_threads_peak_threads</td><td></td><td>AutoGauge</td><td>峰值线程数</td></tr><tr><td>jvm_threads_states_threads</td><td>state="runnable/blocked/waiting/timed-waiting/new/terminated"</td><td>AutoGauge</td><td>当前处于各种状态的线程数</td></tr></tbody></table><h4 id="_4-2-13-jvm-gc-统计" tabindex="-1"><a class="header-anchor" href="#_4-2-13-jvm-gc-统计" aria-hidden="true">#</a> 4.2.13. JVM GC 统计</h4>',13),te=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),ee=t("tr",null,[t("td",null,"jvm_gc_pause"),t("td",null,'action="end of major GC/end of minor GC",cause="xxxx"'),t("td",null,"Timer"),t("td",null,"不同原因的Young GC/Full GC的次数与耗时")],-1),oe=t("tr",null,[t("td"),t("td"),t("td"),t("td")],-1),de=t("td",null,"jvm_gc_concurrent_phase_time",-1),ne=t("td",null,"Timer",-1),ae=t("td",null,"不同原因的Young GC/Full GC的次数与耗时",-1),le=t("tr",null,[t("td"),t("td"),t("td"),t("td")],-1),ie=t("tr",null,[t("td",null,"jvm_gc_max_data_size_bytes"),t("td"),t("td",null,"AutoGauge"),t("td",null,"老年代内存的历史最大值")],-1),re=t("tr",null,[t("td",null,"jvm_gc_live_data_size_bytes"),t("td"),t("td",null,"AutoGauge"),t("td",null,"老年代内存的使用值")],-1),se=t("tr",null,[t("td",null,"jvm_gc_memory_promoted_bytes"),t("td"),t("td",null,"Counter"),t("td",null,"老年代内存正向增长累计值")],-1),ue=t("tr",null,[t("td",null,"jvm_gc_memory_allocated_bytes"),t("td"),t("td",null,"Counter"),t("td",null,"GC分配内存正向增长累计值")],-1),ce=t("h4",{id:"_4-2-14-jvm-内存统计",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-2-14-jvm-内存统计","aria-hidden":"true"},"#"),d(" 4.2.14. JVM 内存统计")],-1),he=t("table",null,[t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])]),t("tbody",null,[t("tr",null,[t("td",null,"jvm_buffer_memory_used_bytes"),t("td",null,'id="direct/mapped"'),t("td",null,"AutoGauge"),t("td",null,"已经使用的缓冲区大小")]),t("tr",null,[t("td",null,"jvm_buffer_total_capacity_bytes"),t("td",null,'id="direct/mapped"'),t("td",null,"AutoGauge"),t("td",null,"最大缓冲区大小")]),t("tr",null,[t("td",null,"jvm_buffer_count_buffers"),t("td",null,'id="direct/mapped"'),t("td",null,"AutoGauge"),t("td",null,"当前缓冲区数量")]),t("tr",null,[t("td",null,"jvm_memory_committed_bytes"),t("td",{area:'heap/nonheap,id="xxx",'}),t("td",null,"AutoGauge"),t("td",null,"当前申请的内存大小")]),t("tr",null,[t("td",null,"jvm_memory_max_bytes"),t("td",{area:'heap/nonheap,id="xxx",'}),t("td",null,"AutoGauge"),t("td",null,"最大内存")]),t("tr",null,[t("td",null,"jvm_memory_used_bytes"),t("td",{area:'heap/nonheap,id="xxx",'}),t("td",null,"AutoGauge"),t("td",null,"已使用内存大小")])])],-1),me=n('<h4 id="_4-2-15-jvm-类加载统计" tabindex="-1"><a class="header-anchor" href="#_4-2-15-jvm-类加载统计" aria-hidden="true">#</a> 4.2.15. JVM 类加载统计</h4><table><thead><tr><th>Metric</th><th>Tags</th><th>Type</th><th>Description</th></tr></thead><tbody><tr><td>jvm_classes_unloaded_classes</td><td></td><td>AutoGauge</td><td>累计卸载的class数量</td></tr><tr><td>jvm_classes_loaded_classes</td><td></td><td>AutoGauge</td><td>累计加载的class数量</td></tr></tbody></table><h4 id="_4-2-16-jvm-编译时间统计" tabindex="-1"><a class="header-anchor" href="#_4-2-16-jvm-编译时间统计" aria-hidden="true">#</a> 4.2.16. JVM 编译时间统计</h4>',3),_e=t("table",null,[t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])]),t("tbody",null,[t("tr",null,[t("td",null,"jvm_compilation_time_ms"),t("td",{compiler:"HotSpot 64-Bit Tiered Compilers,"}),t("td",null,"AutoGauge"),t("td",null,"耗费在编译上的时间")])])],-1),pe=t("h3",{id:"_4-3-normal-级别监控指标",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-3-normal-级别监控指标","aria-hidden":"true"},"#"),d(" 4.3. Normal 级别监控指标")],-1),ge=t("h4",{id:"_4-3-1-集群",tabindex:"-1"},[t("a",{class:"header-anchor",href:"#_4-3-1-集群","aria-hidden":"true"},"#"),d(" 4.3.1. 集群")],-1),be=t("thead",null,[t("tr",null,[t("th",null,"Metric"),t("th",null,"Tags"),t("th",null,"Type"),t("th",null,"Description")])],-1),qe=t("td",null,"region",-1),fe=t("td",null,"AutoGauge",-1),ye=t("td",null,"特定节点上不同 Database 的 DataRegion/SchemaRegion 个数",-1),Te=t("td",null,"slot",-1),De=t("td",null,"AutoGauge",-1),ve=t("td",null,"特定节点上不同 Database 的 DataSlot/SchemaSlot 个数",-1),Ce=n(`<h3 id="_4-4-all-级别监控指标" tabindex="-1"><a class="header-anchor" href="#_4-4-all-级别监控指标" aria-hidden="true">#</a> 4.4. All 级别监控指标</h3><p>目前还没有All级别的监控指标,后续会持续添加。</p><h2 id="_5-怎样获取这些系统监控" tabindex="-1"><a class="header-anchor" href="#_5-怎样获取这些系统监控" aria-hidden="true">#</a> 5. 怎样获取这些系统监控?</h2><ul><li>监控模块的相关配置均在<code>conf/iotdb-{datanode/confignode}.properties</code>中,所有配置项支持通过<code>load configuration</code>命令热加载。</li></ul><h3 id="_5-1-使用-jmx-方式" tabindex="-1"><a class="header-anchor" href="#_5-1-使用-jmx-方式" aria-hidden="true">#</a> 5.1. 使用 JMX 方式</h3><p>对于使用 JMX 对外暴露的指标,可以通过 Jconsole 来进行查看。在进入 Jconsole 监控页面后,首先会看到 IoTDB 的各类运行情况的概览。在这里,您可以看到堆内存信息、线程信息、类信息以及服务器的 CPU 使用情况。</p><h4 id="_5-1-1-获取监控指标数据" tabindex="-1"><a class="header-anchor" href="#_5-1-1-获取监控指标数据" aria-hidden="true">#</a> 5.1.1. 获取监控指标数据</h4><p>连接到 JMX 后,您可以通过 "MBeans" 标签找到名为 "org.apache.iotdb.metrics" 的 "MBean",可以在侧边栏中查看所有监控指标的具体值。</p><img style="width:100%;max-width:800px;max-height:600px;margin-left:auto;margin-right:auto;display:block;" alt="metric-jmx" src="https://alioss.timecho.com/docs/img/github/204018765-6fda9391-ebcf-4c80-98c5-26f34bd74df0.png"><h4 id="_5-1-2-获取其他相关数据" tabindex="-1"><a class="header-anchor" href="#_5-1-2-获取其他相关数据" aria-hidden="true">#</a> 5.1.2. 获取其他相关数据</h4><p>连接到 JMX 后,您可以通过 "MBeans" 标签找到名为 "org.apache.iotdb.service" 的 "MBean",如下图所示,了解服务的基本状态</p><p><img style="width:100%;max-width:800px;max-height:600px;margin-left:auto;margin-right:auto;display:block;" src="https://alioss.timecho.com/docs/img/github/149951720-707f1ee8-32ee-4fde-9252-048caebd232e.png"> <br></p><p>为了提高查询性能,IOTDB 对 ChunkMetaData 和 TsFileMetaData 进行了缓存。用户可以使用 MXBean ,展开侧边栏<code>org.apache.iotdb.db.service</code>查看缓存命中率:</p><img style="width:100%;max-width:800px;max-height:600px;margin-left:auto;margin-right:auto;display:block;" src="https://alioss.timecho.com/docs/img/github/112426760-73e3da80-8d73-11eb-9a8f-9232d1f2033b.png"><h3 id="_5-2-使用-prometheus-方式" tabindex="-1"><a class="header-anchor" href="#_5-2-使用-prometheus-方式" aria-hidden="true">#</a> 5.2. 使用 Prometheus 方式</h3><h4 id="_5-2-1-监控指标的-prometheus-映射关系" tabindex="-1"><a class="header-anchor" href="#_5-2-1-监控指标的-prometheus-映射关系" aria-hidden="true">#</a> 5.2.1. 监控指标的 Prometheus 映射关系</h4><blockquote><p>对于 Metric Name 为 name, Tags 为 K1=V1, ..., Kn=Vn 的监控指标有如下映射,其中 value 为具体值</p></blockquote><table><thead><tr><th>监控指标类型</th><th>映射关系</th></tr></thead><tbody><tr><td>Counter</td><td>name_total{k1="V1", ..., Kn="Vn"} value</td></tr><tr><td>AutoGauge、Gauge</td><td>name{k1="V1", ..., Kn="Vn"} value</td></tr><tr><td>Histogram</td><td>name_max{k1="V1", ..., Kn="Vn"} value <br> name_sum{k1="V1", ..., Kn="Vn"} value <br> name_count{k1="V1", ..., Kn="Vn"} value <br> name{k1="V1", ..., Kn="Vn", quantile="0.0"} value <br> name{k1="V1", ..., Kn="Vn", quantile="0.25"} value <br> name{k1="V1", ..., Kn="Vn", quantile="0.5"} value <br> name{k1="V1", ..., Kn="Vn", quantile="0.75"} value <br> name{k1="V1", ..., Kn="Vn", quantile="1.0"} value</td></tr><tr><td>Rate</td><td>name_total{k1="V1", ..., Kn="Vn"} value <br> name_total{k1="V1", ..., Kn="Vn", rate="m1"} value <br> name_total{k1="V1", ..., Kn="Vn", rate="m5"} value <br> name_total{k1="V1", ..., Kn="Vn", rate="m15"} value <br> name_total{k1="V1", ..., Kn="Vn", rate="mean"} value</td></tr><tr><td>Timer</td><td>name_seconds_max{k1="V1", ..., Kn="Vn"} value <br> name_seconds_sum{k1="V1", ..., Kn="Vn"} value <br> name_seconds_count{k1="V1", ..., Kn="Vn"} value <br> name_seconds{k1="V1", ..., Kn="Vn", quantile="0.0"} value <br> name_seconds{k1="V1", ..., Kn="Vn", quantile="0.25"} value <br> name_seconds{k1="V1", ..., Kn="Vn", quantile="0.5"} value <br> name_seconds{k1="V1", ..., Kn="Vn", quantile="0.75"} value <br> name_seconds{k1="V1", ..., Kn="Vn", quantile="1.0"} value</td></tr></tbody></table><h4 id="_5-2-2-修改配置文件" tabindex="-1"><a class="header-anchor" href="#_5-2-2-修改配置文件" aria-hidden="true">#</a> 5.2.2. 修改配置文件</h4><ol><li>以 DataNode 为例,修改 iotdb-datanode.properties 配置文件如下:</li></ol><div class="language-properties line-numbers-mode" data-ext="properties"><pre class="language-properties"><code><span class="token key attr-name">dn_metric_reporter_list</span><span class="token punctuation">=</span><span class="token value attr-value">PROMETHEUS</span> |
| `),m,t("p",null,[d("目前,IoTDB 对外提供一些主要模块的监控指标,并且随着新功能的开发以及系统优化或者重构,监控指标也会同步添加和更新。如果想自己在 IoTDB 中添加更多系统监控指标埋点,可以参考"),t("a",_,[d("IoTDB Metrics Framework"),l(a)]),d("使用说明。")]),p,g,b,t("table",null,[q,t("tbody",null,[f,y,t("tr",null,[T,t("td",null,'database="'+o(e.database)+'", type="flush"',1),D,v])])]),C,t("table",null,[k,t("tbody",null,[t("tr",null,[G,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'"',1),V,I]),t("tr",null,[A,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'",type="ConfigNode/DataNode"',1),M,x])])]),P,t("table",null,[K,t("tbody",null,[S,B,N,R,w,t("tr",null,[O,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'",type="SchemaRegion"',1),j,U]),t("tr",null,[L,t("td",null,'name="'+o(e.ip)+":"+o(e.port)+'",type="DataRegion"',1),H,z])])]),F,t("table",null,[J,t("tbody",null,[t("tr",null,[E,t("td",null,'name="logDispatcher-'+o(e.IP)+":"+o(e.Port)+'", region="'+o(e.region)+'", type="currentSyncIndex"',1),W,X]),t("tr",null,[Q,t("td",null,'name="logDispatcher-'+o(e.IP)+":"+o(e.Port)+'", region="'+o(e.region)+'", type="cachedRequestInMemoryQueue"',1),Y,Z]),t("tr",null,[$,t("td",null,'name="IoTConsensusServerImpl", region="'+o(e.region)+'", type="searchIndex"',1),tt,et]),t("tr",null,[ot,t("td",null,'name="IoTConsensusServerImpl", region="'+o(e.region)+'", type="safeIndex"',1),dt,nt]),t("tr",null,[at,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="getStateMachineLock"',1),lt,it]),t("tr",null,[rt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="checkingBeforeWrite"',1),st,ut]),t("tr",null,[ct,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="writeStateMachine"',1),ht,mt]),t("tr",null,[_t,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="offerRequestToQueue"',1),pt,gt]),t("tr",null,[bt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="consensusWrite"',1),qt,ft]),t("tr",null,[yt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="constructBatch"',1),Tt,Dt]),t("tr",null,[vt,t("td",null,'name="iot_consensus", region="'+o(e.region)+'", type="syncLogTimePerRequest"',1),Ct,kt])])]),Gt,t("table",null,[Vt,t("tbody",null,[t("tr",null,[It,t("td",null,'name = "'+o(e.name)+'"',1),At,Mt]),t("tr",null,[xt,t("td",null,'name="'+o(e.interface)+'"',1),Pt,Kt]),St,Bt,Nt,Rt,wt,Ot,jt,Ut])]),Lt,t("table",null,[Ht,t("tbody",null,[t("tr",null,[zt,t("td",null,'name="database_'+o(e.name)+'"',1),Ft,Jt]),t("tr",null,[Et,t("td",null,'name="chunkMetaData_'+o(e.name)+'"',1),Wt,Xt]),Qt,Yt,Zt])]),$t,t("table",null,[te,t("tbody",null,[ee,oe,t("tr",null,[de,t("td",null,'action="'+o(e.action)+'",cause="'+o(e.cause)+'"',1),ne,ae]),le,ie,re,se,ue])]),ce,he,me,_e,pe,ge,t("table",null,[be,t("tbody",null,[t("tr",null,[qe,t("td",null,'name="'+o(e.DatabaseName)+'",type="SchemaRegion/DataRegion"',1),fe,ye]),t("tr",null,[Te,t("td",null,'name="'+o(e.DatabaseName)+'",type="schemaSlotNumber/dataSlotNumber"',1),De,ve])])]),Ce,t("p",null,[t("a",ke,[d("Prometheus安装使用文档"),l(a)])]),t("p",null,[t("a",Ge,[d("Prometheus从HTTP接口拉取metrics数据的配置说明"),l(a)])]),t("p",null,[t("a",Ve,[d("Grafana安装使用文档"),l(a)])]),t("p",null,[t("a",Ie,[d("Grafana从Prometheus查询数据并绘图的文档"),l(a)])]),Ae,t("ol",null,[Me,t("li",null,[d("您可以访问"),t("a",xe,[d("Grafana Dashboard官网"),l(a)]),d("搜索"),Pe,d("并使用")])]),Ke])}const Re=i(h,[["render",Se],["__file","Metric-Tool.html.vue"]]);export{Re as default}; |