{
"layouts": [
{
"layout_name": "default_yarn_dashboard",
"display_name": "Standard YARN Dashboard",
"section_name": "YARN_SUMMARY",
"widgetLayoutInfo": [
{
"widget_name": "Memory Utilization",
"description": "Percentage of total memory allocated to containers running in the cluster.",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "yarn.QueueMetrics.Queue=root.AllocatedMB",
"metric_path": "metrics/yarn/Queue/root/AllocatedMB",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
},
{
"name": "yarn.QueueMetrics.Queue=root.AvailableMB",
"metric_path": "metrics/yarn/Queue/root/AvailableMB",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
}
],
"values": [
{
"name": "Memory Utilization",
"value": "${(yarn.QueueMetrics.Queue=root.AllocatedMB / (yarn.QueueMetrics.Queue=root.AllocatedMB + yarn.QueueMetrics.Queue=root.AvailableMB)) * 100}"
}
],
"properties": {
"display_unit": "%",
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "CPU Utilization",
"description": "Percentage of total virtual cores allocated to containers running in the cluster.",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "yarn.QueueMetrics.Queue=root.AllocatedVCores",
"metric_path": "metrics/yarn/Queue/root/AllocatedVCores",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
},
{
"name": "yarn.QueueMetrics.Queue=root.AvailableVCores",
"metric_path": "metrics/yarn/Queue/root/AvailableVCores",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
}
],
"values": [
{
"name": "CPU Utilization",
"value": "${(yarn.QueueMetrics.Queue=root.AllocatedVCores / (yarn.QueueMetrics.Queue=root.AllocatedVCores + yarn.QueueMetrics.Queue=root.AvailableVCores)) * 100}"
}
],
"properties": {
"display_unit": "%",
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "Container Failures",
"description": "Percentage of all containers failing in the cluster.",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "yarn.NodeManagerMetrics.ContainersFailed._rate",
"metric_path": "metrics/yarn/ContainersFailed._rate",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersCompleted._rate",
"metric_path": "metrics/yarn/ContainersCompleted._rate",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersLaunched._rate",
"metric_path": "metrics/yarn/ContainersLaunched._rate",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersIniting._sum",
"metric_path": "metrics/yarn/ContainersIniting._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersKilled._rate",
"metric_path": "metrics/yarn/ContainersKilled._rate",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersRunning._sum",
"metric_path": "metrics/yarn/ContainersRunning._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Container Failures",
"value": "${(yarn.NodeManagerMetrics.ContainersFailed._rate/(yarn.NodeManagerMetrics.ContainersFailed._rate + yarn.NodeManagerMetrics.ContainersCompleted._rate + yarn.NodeManagerMetrics.ContainersLaunched._rate + yarn.NodeManagerMetrics.ContainersIniting._sum + yarn.NodeManagerMetrics.ContainersKilled._rate + yarn.NodeManagerMetrics.ContainersRunning._sum)) * 100}"
}
],
"properties": {
"display_unit": "%",
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "App Failures",
"description": "Percentage of all launched applications failing in the cluster.",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "yarn.QueueMetrics.Queue=root.AppsFailed._rate",
"metric_path": "metrics/yarn/Queue/root/AppsFailed._rate",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
},
{
"name": "yarn.QueueMetrics.Queue=root.AppsKilled._rate",
"metric_path": "metrics/yarn/Queue/root/AppsKilled._rate",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
},
{
"name": "yarn.QueueMetrics.Queue=root.AppsPending",
"metric_path": "metrics/yarn/Queue/root/AppsPending",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
},
{
"name": "yarn.QueueMetrics.Queue=root.AppsRunning",
"metric_path": "metrics/yarn/Queue/root/AppsRunning",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
},
{
"name": "yarn.QueueMetrics.Queue=root.AppsSubmitted._rate",
"metric_path": "metrics/yarn/Queue/root/AppsSubmitted._rate",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
},
{
"name": "yarn.QueueMetrics.Queue=root.AppsCompleted._rate",
"metric_path": "metrics/yarn/Queue/root/AppsCompleted._rate",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
}
],
"values": [
{
"name": "App Failures",
"value": "${(yarn.QueueMetrics.Queue=root.AppsFailed._rate/(yarn.QueueMetrics.Queue=root.AppsFailed._rate + yarn.QueueMetrics.Queue=root.AppsKilled._rate + yarn.QueueMetrics.Queue=root.AppsPending + yarn.QueueMetrics.Queue=root.AppsRunning + yarn.QueueMetrics.Queue=root.AppsSubmitted._rate + yarn.QueueMetrics.Queue=root.AppsCompleted._rate)) * 100}"
}
],
"properties": {
"display_unit": "%",
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "Pending Apps",
"description": "Count of applications waiting for cluster resources to become available.",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "yarn.QueueMetrics.Queue=root.AppsPending",
"metric_path": "metrics/yarn/Queue/root/AppsPending",
"service_name": "YARN",
"component_name": "RESOURCEMANAGER",
"host_component_criteria": "host_components/HostRoles/ha_state=ACTIVE"
}
],
"values": [
{
"name": "Pending Apps",
"value": "${yarn.QueueMetrics.Queue=root.AppsPending}"
}
],
"properties": {
"display_unit": "Apps",
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "Cluster Memory",
"description": "Percentage of memory used across all NodeManager hosts.",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "mem_total._sum",
"metric_path": "metrics/memory/mem_total._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "mem_free._sum",
"metric_path": "metrics/memory/mem_free._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Memory utilization",
"value": "${((mem_total._sum - mem_free._sum)/mem_total._sum) * 100}"
}
],
"properties": {
"display_unit": "%",
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "Cluster Disk",
"description": "Sum of disk throughput for all NodeManager hosts.",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "read_bps._sum",
"metric_path": "metrics/disk/read_bps._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "write_bps._sum",
"metric_path": "metrics/disk/write_bps._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Read throughput",
"value": "${read_bps._sum/1048576}"
},
{
"name": "Write throughput",
"value": "${write_bps._sum/1048576}"
}
],
"properties": {
"display_unit": "MB/s",
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "Cluster Network",
"description": "Average network packets in and out across all NodeManager hosts.",
"default_section_name": "YARN_SUMMARY",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "pkts_in._avg",
"metric_path": "metrics/network/pkts_in._avg",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "pkts_out._avg",
"metric_path": "metrics/network/pkts_out._avg",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Packets In",
"value": "${pkts_in._avg}"
},
{
"name": "Packets Out",
"value": "${pkts_out._avg}"
}
],
"properties": {
"graph_type": "LINE",
"time_range": "1"
}
},
{
"widget_name": "Cluster CPU",
"description": "Percentage of CPU utilized across all NodeManager hosts.",
"default_section_name": "YARN_SUMMARY",
"widget_type": "GRAPH",
"is_visible": true,
"metrics": [
{
"name": "cpu_system._sum",
"metric_path": "metrics/cpu/cpu_system._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "cpu_user._sum",
"metric_path": "metrics/cpu/cpu_user._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "cpu_nice._sum",
"metric_path": "metrics/cpu/cpu_nice._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "cpu_idle._sum",
"metric_path": "metrics/cpu/cpu_idle._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "cpu_wio._sum",
"metric_path": "metrics/cpu/cpu_wio._sum",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "CPU utilization",
"value": "${((cpu_system._sum + cpu_user._sum + cpu_nice._sum)/(cpu_system._sum + cpu_user._sum + cpu_nice._sum + cpu_idle._sum + cpu_wio._sum)) * 100}"
}
],
"properties": {
"graph_type": "LINE",
"time_range": "1",
"display_unit": "%"
}
}
]
},
{
"layout_name": "default_yarn_heatmap",
"display_name": "YARN Heatmaps",
"section_name": "YARN_HEATMAPS",
"widgetLayoutInfo": [
{
"widget_name": "Total Allocatable RAM Utilized per NodeManager",
"description": "Percentage of allocatable memory currently allocated to containers on each NodeManager.",
"widget_type": "HEATMAP",
"is_visible": true,
"metrics": [
{
"name": "yarn.NodeManagerMetrics.AllocatedGB",
"metric_path": "metrics/yarn/AllocatedGB",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.AvailableGB",
"metric_path": "metrics/yarn/AvailableGB",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Total Allocatable RAM Utilized per NodeManager",
"value": "${(yarn.NodeManagerMetrics.AllocatedGB/(yarn.NodeManagerMetrics.AvailableGB + yarn.NodeManagerMetrics.AllocatedGB)) * 100}"
}
],
"properties": {
"display_unit": "%",
"max_limit": "100"
}
},
{
"widget_name": "Total Allocatable CPU Utilized per NodeManager",
"description": "Percentage of allocatable virtual cores currently allocated to containers on each NodeManager.",
"widget_type": "HEATMAP",
"is_visible": false,
"metrics": [
{
"name": "yarn.NodeManagerMetrics.AllocatedVCores",
"metric_path": "metrics/yarn/AllocatedVCores",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.AvailableVCores",
"metric_path": "metrics/yarn/AvailableVCores",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Total Allocatable CPU Utilized per NodeManager",
"value": "${(yarn.NodeManagerMetrics.AllocatedVCores/(yarn.NodeManagerMetrics.AllocatedVCores + yarn.NodeManagerMetrics.AvailableVCores)) * 100}"
}
],
"properties": {
"display_unit": "%",
"max_limit": "100"
}
},
{
"widget_name": "Container Failures",
"description": "Failed containers as a percentage of all containers tracked on each NodeManager.",
"widget_type": "HEATMAP",
"is_visible": false,
"metrics": [
{
"name": "yarn.NodeManagerMetrics.ContainersFailed",
"metric_path": "metrics/yarn/ContainersFailed",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersCompleted",
"metric_path": "metrics/yarn/ContainersCompleted",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersLaunched",
"metric_path": "metrics/yarn/ContainersLaunched",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersIniting",
"metric_path": "metrics/yarn/ContainersIniting",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersKilled",
"metric_path": "metrics/yarn/ContainersKilled",
"service_name": "YARN",
"component_name": "NODEMANAGER"
},
{
"name": "yarn.NodeManagerMetrics.ContainersRunning",
"metric_path": "metrics/yarn/ContainersRunning",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Container Failures",
"value": "${(yarn.NodeManagerMetrics.ContainersFailed/(yarn.NodeManagerMetrics.ContainersFailed + yarn.NodeManagerMetrics.ContainersCompleted + yarn.NodeManagerMetrics.ContainersLaunched + yarn.NodeManagerMetrics.ContainersIniting + yarn.NodeManagerMetrics.ContainersKilled + yarn.NodeManagerMetrics.ContainersRunning)) * 100}"
}
],
"properties": {
"display_unit": "%",
"max_limit": "100"
}
},
{
"widget_name": "NodeManager GC Time",
"description": "JVM garbage collection time reported by each NodeManager, in milliseconds.",
"widget_type": "HEATMAP",
"is_visible": false,
"metrics": [
{
"name": "Hadoop:service=NodeManager,name=JvmMetrics.GcTimeMillis",
"metric_path": "metrics/jvm/gcTimeMillis",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "NodeManager Garbage Collection Time",
"value": "${Hadoop:service=NodeManager,name=JvmMetrics.GcTimeMillis}"
}
],
"properties": {
"display_unit": "ms",
"max_limit": "10000"
}
},
{
"widget_name": "NodeManager JVM Heap Memory Used",
"description": "JVM heap memory used by each NodeManager, in MB.",
"widget_type": "HEATMAP",
"is_visible": false,
"metrics": [
{
"name": "Hadoop:service=NodeManager,name=JvmMetrics.MemHeapUsedM",
"metric_path": "metrics/jvm/memHeapUsedM",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "NodeManager JVM Heap Memory Used",
"value": "${Hadoop:service=NodeManager,name=JvmMetrics.MemHeapUsedM}"
}
],
"properties": {
"display_unit": "MB",
"max_limit": "512"
}
},
{
"widget_name": "Allocated Containers",
"description": "Number of containers currently allocated on each NodeManager.",
"widget_type": "HEATMAP",
"is_visible": false,
"metrics": [
{
"name": "yarn.NodeManagerMetrics.AllocatedContainers",
"metric_path": "metrics/yarn/AllocatedContainers",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "Allocated Containers",
"value": "${yarn.NodeManagerMetrics.AllocatedContainers}"
}
],
"properties": {
"display_unit": "",
"max_limit": "100"
}
},
{
"widget_name": "NodeManager RAM Utilized",
"description": "Memory allocated to containers on each NodeManager, in GB.",
"widget_type": "HEATMAP",
"is_visible": false,
"metrics": [
{
"name": "yarn.NodeManagerMetrics.AllocatedGB",
"metric_path": "metrics/yarn/AllocatedGB",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "NodeManager RAM Utilized",
"value": "${yarn.NodeManagerMetrics.AllocatedGB}"
}
],
"properties": {
"display_unit": "GB",
"max_limit": "100"
}
},
{
"widget_name": "NodeManager CPU Utilized",
"description": "Number of virtual cores allocated to containers on each NodeManager.",
"widget_type": "HEATMAP",
"is_visible": false,
"metrics": [
{
"name": "yarn.NodeManagerMetrics.AllocatedVCores",
"metric_path": "metrics/yarn/AllocatedVCores",
"service_name": "YARN",
"component_name": "NODEMANAGER"
}
],
"values": [
{
"name": "NodeManager CPU Utilized",
"value": "${yarn.NodeManagerMetrics.AllocatedVCores}"
}
],
"properties": {
"display_unit": "",
"max_limit": "100"
}
}
]
}
]
}