{
"AMBARI_METRICS": {
"service": [
{
"name": "metrics_monitor_process_percent",
"label": "Percent Metrics Monitors Available",
"description": "This alert is triggered if a percentage of Metrics Monitor processes are not up and listening on the network for the configured warning and critical thresholds.",
"interval": 1,
"scope": "SERVICE",
"enabled": true,
"source": {
"type": "AGGREGATE",
"alert_name": "ams_metrics_monitor_process",
"reporting": {
"ok": {
"text": "affected: [{1}], total: [{0}]"
},
"warning": {
"text": "affected: [{1}], total: [{0}]",
"value": 10
},
"critical": {
"text": "affected: [{1}], total: [{0}]",
"value": 30
},
"units" : "%",
"type": "PERCENT"
}
}
}
],
"METRICS_COLLECTOR": [
{
"name": "ams_metrics_collector_autostart",
"label": "Metrics Collector - Auto-Restart Status",
"description": "This alert is triggered if the Metrics Collector has been restarted automatically too frequently in last one hour. By default, a Warning alert is triggered if restarted twice in one hour and a Critical alert is triggered if restarted 4 or more times in one hour.",
"interval": 1,
"scope": "ANY",
"enabled": true,
"source": {
"type": "RECOVERY",
"reporting": {
"ok": {
"text": "Metrics Collector has not been auto-started and is running normally{0}."
},
"warning": {
"text": "Metrics Collector has been auto-started {1} times{0}.",
"count": 2
},
"critical": {
"text": "Metrics Collector has been auto-started {1} times{0}.",
"count": 4
}
}
}
},
{
"name": "ams_metrics_collector_process",
"label": "Metrics Collector Process",
"description": "This alert is triggered if the Metrics Collector cannot be confirmed to be up and listening on the configured port for number of seconds equal to threshold.",
"interval": 1,
"scope": "ANY",
"enabled": true,
"source": {
"type": "PORT",
"uri": "{{ams-site/timeline.metrics.service.webapp.address}}",
"default_port": 6188,
"reporting": {
"ok": {
"text": "TCP OK - {0:.3f}s response on port {1}"
},
"warning": {
"text": "TCP OK - {0:.3f}s response on port {1}",
"value": 1.5
},
"critical": {
"text": "Connection failed: {0} to {1}:{2}",
"value": 5.0
}
}
}
},
{
"name": "ams_metrics_collector_hbase_master_process",
"label": "Metrics Collector - HBase Master Process",
"description": "This alert is triggered if the Metrics Collector's HBase master processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
"interval": 1,
"scope": "ANY",
"source": {
"type": "PORT",
"uri": "{{ams-hbase-site/hbase.master.info.port}}",
"default_port": 61310,
"reporting": {
"ok": {
"text": "TCP OK - {0:.3f}s response on port {1}"
},
"warning": {
"text": "TCP OK - {0:.3f}s response on port {1}",
"value": 1.5
},
"critical": {
"text": "Connection failed: {0} to {1}:{2}",
"value": 5.0
}
}
}
},
{
"name": "ams_metrics_collector_hbase_master_cpu",
"label": "Metrics Collector - HBase Master CPU Utilization",
"description": "This host-level alert is triggered if CPU utilization of the Metrics Collector's HBase Master exceeds certain warning and critical thresholds. It checks the HBase Master JMX Servlet for the SystemCPULoad property. The threshold values are in percent.",
"interval": 5,
"scope": "ANY",
"enabled": true,
"source": {
"type": "METRIC",
"uri": {
"http": "{{ams-hbase-site/hbase.master.info.port}}",
"default_port": 61310,
"connection_timeout": 5.0
},
"reporting": {
"ok": {
"text": "{1} CPU, load {0:.1%}"
},
"warning": {
"text": "{1} CPU, load {0:.1%}",
"value": 200
},
"critical": {
"text": "{1} CPU, load {0:.1%}",
"value": 250
},
"units" : "%",
"type": "PERCENT"
},
"jmx": {
"property_list": [
"java.lang:type=OperatingSystem/SystemCpuLoad",
"java.lang:type=OperatingSystem/AvailableProcessors"
],
"value": "{0} * 100"
}
}
}
],
"METRICS_MONITOR": [
{
"name": "ams_metrics_monitor_process",
"label": "Metrics Monitor Status",
"description": "This alert indicates the status of the Metrics Monitor process as determined by the monitor status script.",
"interval": 1,
"scope": "ANY",
"source": {
"type": "SCRIPT",
"path": "AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py"
}
}
],
"METRICS_GRAFANA": [
{
"name": "grafana_webui",
"label": "Grafana Web UI",
"description": "This host-level alert is triggered if the Grafana Web UI is unreachable.",
"interval": 1,
"scope": "ANY",
"source": {
"type": "WEB",
"uri": {
"http": "{{ams-grafana-ini/port}}",
"https": "{{ams-grafana-ini/port}}",
"https_property": "{{ams-grafana-ini/protocol}}",
"https_property_value": "https",
"connection_timeout": 5.0,
"default_port": 3000
},
"reporting": {
"ok": {
"text": "HTTP {0} response in {2:.3f}s"
},
"warning":{
"text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
},
"critical": {
"text": "Connection failed to {1} ({3})"
}
}
}
}
]
}
}