blob: 5acd2802875fd40f4c12bc04f11e01ee84714a1b [file] [log] [blame]
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#
# NAGIOS SERVER Check (status log update)
<%if scope.function_hdp_nagios_members_exist('nagios-server')-%>
# Shared template: every service in this file inherits from it through
# "use hadoop-service". It is marked "register 0" because Nagios does not
# inherit the register directive, so each template must set it explicitly.
# NOTE(review): this template is emitted only when the nagios-server hostgroup
# has members, while sections outside this guard also inherit from it --
# confirm the guard always holds wherever this file is rendered.
define service {
    name                      hadoop-service
    use                       generic-service
    notification_options      w,u,c
    first_notification_delay  0
    notification_interval     0    ; Send the notification once
    register                  0    ; Template only, never a real service
}

# Staleness check on the Nagios status log (/var/nagios/status.dat).
define service {
    use                       hadoop-service
    hostgroup_name            nagios-server
    servicegroups             NAGIOS
    service_description       NAGIOS::Nagios status log staleness
    check_command             check_nagios!10!/var/nagios/status.dat!<%=nagios_lookup_daemon_str%>
    normal_check_interval     5
    retry_check_interval      0.5
    max_check_attempts        2
}

# The remaining checks in this section use check_aggregate. Each one passes the
# description of a per-host service defined elsewhere in this file, followed by
# two percentages (presumably warning and critical thresholds -- confirm against
# the check_aggregate command definition).

# NAGIOS SERVER HDFS Checks
define service {
    use                       hadoop-service
    hostgroup_name            nagios-server
    servicegroups             HDFS
    service_description       HDFS::Percent DataNodes storage full
    check_command             check_aggregate!"DATANODE::DataNode storage full"!10%!30%
    normal_check_interval     2
    retry_check_interval      1
    max_check_attempts        1
}

define service {
    use                       hadoop-service
    hostgroup_name            nagios-server
    servicegroups             HDFS
    service_description       HDFS::Percent DataNodes down
    check_command             check_aggregate!"DATANODE::DataNode process down"!10%!30%
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        3
}

# NAGIOS SERVER MAPREDUCE Checks
define service {
    use                       hadoop-service
    hostgroup_name            nagios-server
    servicegroups             MAPREDUCE
    service_description       MAPREDUCE::Percent TaskTrackers down
    check_command             check_aggregate!"TASKTRACKER::TaskTracker process down"!10%!30%
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        3
}

# NAGIOS SERVER ZOOKEEPER Checks
# Emitted only when the zookeeper-servers hostgroup has members.
<%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%>
define service {
    use                       hadoop-service
    hostgroup_name            nagios-server
    servicegroups             ZOOKEEPER
    service_description       ZOOKEEPER::Percent ZooKeeper Servers down
    check_command             check_aggregate!"ZOOKEEPER::ZooKeeper Server process down"!35%!70%
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        3
}
<%end-%>

# NAGIOS SERVER HBASE Checks
# Emitted only when the hbasemaster hostgroup has members.
<%if scope.function_hdp_nagios_members_exist('hbasemaster')-%>
define service {
    use                       hadoop-service
    hostgroup_name            nagios-server
    servicegroups             HBASE
    service_description       HBASE::Percent RegionServers down
    check_command             check_aggregate!"REGIONSERVER::RegionServer process down"!10%!30%
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        3
}
<%end-%>
<%end-%>
# GANGLIA SERVER Checks
<%if scope.function_hdp_nagios_members_exist('ganglia-server')-%>
# Everything in this section is a check_tcp service on the ganglia-server
# hostgroup. The first check_tcp argument is 8651 for gmetad and 8660-8663 for
# the four gmond collectors; the second is always "-w 1 -c 1".

# gmetad
define service {
    use                       hadoop-service
    hostgroup_name            ganglia-server
    servicegroups             GANGLIA
    service_description       GANGLIA::Ganglia [gmetad] process down
    check_command             check_tcp!8651!-w 1 -c 1
    normal_check_interval     0.25
    retry_check_interval      0.25
    max_check_attempts        4
}

# gmond collector for slaves
define service {
    use                       hadoop-service
    hostgroup_name            ganglia-server
    servicegroups             GANGLIA
    service_description       GANGLIA::Ganglia Collector [gmond] process down alert for slaves
    check_command             check_tcp!8660!-w 1 -c 1
    normal_check_interval     0.25
    retry_check_interval      0.25
    max_check_attempts        4
}

# gmond collector for NameNode
define service {
    use                       hadoop-service
    hostgroup_name            ganglia-server
    servicegroups             GANGLIA
    service_description       GANGLIA::Ganglia Collector [gmond] process down alert for NameNode
    check_command             check_tcp!8661!-w 1 -c 1
    normal_check_interval     0.25
    retry_check_interval      0.25
    max_check_attempts        4
}

# gmond collector for JobTracker
define service {
    use                       hadoop-service
    hostgroup_name            ganglia-server
    servicegroups             GANGLIA
    service_description       GANGLIA::Ganglia Collector [gmond] process down alert for JobTracker
    check_command             check_tcp!8662!-w 1 -c 1
    normal_check_interval     0.25
    retry_check_interval      0.25
    max_check_attempts        4
}

# gmond collector for HBase Master; emitted only when the hbasemaster
# hostgroup has members.
<%if scope.function_hdp_nagios_members_exist('hbasemaster')-%>
define service {
    use                       hadoop-service
    hostgroup_name            ganglia-server
    servicegroups             GANGLIA
    service_description       GANGLIA::Ganglia Collector [gmond] process down alert for HBase Master
    check_command             check_tcp!8663!-w 1 -c 1
    normal_check_interval     0.25
    retry_check_interval      0.25
    max_check_attempts        4
}
<%end-%>
<%end-%>
<%if scope.function_hdp_nagios_members_exist('snamenode')-%>
# Secondary NameNode: check_tcp service on the snamenode hostgroup, invoked
# with 50090 and "-w 1 -c 1".
define service {
    use                       hadoop-service
    hostgroup_name            snamenode
    servicegroups             HDFS
    service_description       NAMENODE::Secondary NameNode process down
    check_command             check_tcp!50090!-w 1 -c 1
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        3
}
<%end-%>
<%if scope.function_hdp_nagios_members_exist('namenode')-%>
# NameNode / HDFS services. All of them target the namenode hostgroup and
# belong to the HDFS servicegroup.

# Web UI
define service {
    use                       hadoop-service
    hostgroup_name            namenode
    servicegroups             HDFS
    service_description       NAMENODE::NameNode Web UI down
    check_command             check_webui!namenode
    normal_check_interval     1
    retry_check_interval      1
    max_check_attempts        3
}

# Edit logs directory status
define service {
    use                       hadoop-service
    hostgroup_name            namenode
    servicegroups             HDFS
    service_description       NAMENODE::NameNode edit logs directory status
    check_command             check_name_dir_status!50070
    normal_check_interval     0.5
    retry_check_interval      0.5
    max_check_attempts        3
}

# Host CPU utilization; left out when hdp_os_type is "suse".
<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
    use                       hadoop-service
    hostgroup_name            namenode
    servicegroups             HDFS
    service_description       NAMENODE::NameNode host CPU utilization
    check_command             check_cpu!200%!250%
    normal_check_interval     5
    retry_check_interval      2
    max_check_attempts        5
}
<% end %>

# Process check: check_tcp invoked with 8020
define service {
    use                       hadoop-service
    hostgroup_name            namenode
    servicegroups             HDFS
    service_description       NAMENODE::NameNode process down
    check_command             check_tcp!8020!-w 1 -c 1
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        3
}

# Corrupt or missing blocks
define service {
    use                       hadoop-service
    hostgroup_name            namenode
    servicegroups             HDFS
    service_description       HDFS::Corrupt/Missing blocks
    check_command             check_hdfs_blocks!50070!0%!0%
    normal_check_interval     2
    retry_check_interval      1
    max_check_attempts        1
}

# Capacity utilization
define service {
    use                       hadoop-service
    hostgroup_name            namenode
    servicegroups             HDFS
    service_description       HDFS::HDFS capacity utilization
    check_command             check_hdfs_capacity!50070!80%!90%
    normal_check_interval     10
    retry_check_interval      1
    max_check_attempts        1
}

# RPC latency
define service {
    use                       hadoop-service
    hostgroup_name            namenode
    servicegroups             HDFS
    service_description       HDFS::NameNode RPC latency
    check_command             check_rpcq_latency!NameNode!50070!3000!5000
    normal_check_interval     5
    retry_check_interval      1
    max_check_attempts        5
}
<%end-%>
# MAPREDUCE Checks
<%if scope.function_hdp_nagios_members_exist('jobtracker')-%>
# JobTracker services. All of them target the jobtracker hostgroup and belong
# to the MAPREDUCE servicegroup.

# JobTracker Web UI
define service {
    use                       hadoop-service
    hostgroup_name            jobtracker
    servicegroups             MAPREDUCE
    service_description       JOBTRACKER::JobTracker Web UI down
    check_command             check_webui!jobtracker
    normal_check_interval     1
    retry_check_interval      1
    max_check_attempts        3
}

# JobHistory Web UI
define service {
    use                       hadoop-service
    hostgroup_name            jobtracker
    servicegroups             MAPREDUCE
    service_description       JOBTRACKER::JobHistory Web UI down
    check_command             check_webui!jobhistory
    normal_check_interval     1
    retry_check_interval      1
    max_check_attempts        3
}

# CPU utilization; left out when hdp_os_type is "suse".
<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
    use                       hadoop-service
    hostgroup_name            jobtracker
    servicegroups             MAPREDUCE
    service_description       JOBTRACKER::JobTracker CPU utilization
    check_command             check_cpu!200%!250%
    normal_check_interval     5
    retry_check_interval      2
    max_check_attempts        5
}
<% end %>

# Process check: check_tcp invoked with 50030
define service {
    use                       hadoop-service
    hostgroup_name            jobtracker
    servicegroups             MAPREDUCE
    service_description       JOBTRACKER::JobTracker process down
    check_command             check_tcp!50030!-w 1 -c 1
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        4
}

# RPC latency
define service {
    use                       hadoop-service
    hostgroup_name            jobtracker
    servicegroups             MAPREDUCE
    service_description       MAPREDUCE::JobTracker RPC latency
    check_command             check_rpcq_latency!JobTracker!50030!3000!5000
    normal_check_interval     5
    retry_check_interval      1
    max_check_attempts        5
}

# MAPREDUCE::TASKTRACKER Checks
# This service targets the slaves hostgroup, not jobtracker, so it is also
# guarded on slaves membership. Without that guard it would be emitted for a
# hostgroup that has no members.
<%if scope.function_hdp_nagios_members_exist('slaves')-%>
define service {
    use                       hadoop-service
    hostgroup_name            slaves
    servicegroups             MAPREDUCE
    service_description       TASKTRACKER::TaskTracker process down
    check_command             check_tcp!50060!-w 1 -c 1
    normal_check_interval     1
    retry_check_interval      0.5
    max_check_attempts        3
}
<%end-%>
<%end-%>
<%if scope.function_hdp_nagios_members_exist('slaves')-%>
# DataNode services on the slaves hostgroup. The first argument of each check
# command is filled in from a template variable (dfs_datanode_address and
# dfs_datanode_http_address respectively).

# Process check
define service {
    use                       hadoop-service
    hostgroup_name            slaves
    servicegroups             HDFS
    service_description       DATANODE::DataNode process down
    check_command             check_tcp!<%=scope.function_hdp_template_var("dfs_datanode_address")%>!-w 1 -c 1
    normal_check_interval     1
    retry_check_interval      0.5
    max_check_attempts        3
}

# Storage check
# NOTE(review): both percentage arguments are 90% -- confirm that identical
# values are intended.
define service {
    use                       hadoop-service
    hostgroup_name            slaves
    servicegroups             HDFS
    service_description       DATANODE::DataNode storage full
    check_command             check_datanode_storage!<%=scope.function_hdp_template_var("dfs_datanode_http_address")%>!90%!90%
    normal_check_interval     5
    retry_check_interval      1
    max_check_attempts        2
}
<%end-%>
<%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%>
# ZooKeeper server: check_tcp service on the zookeeper-servers hostgroup,
# invoked with 2181 and "-w 1 -c 1".
define service {
    use                       hadoop-service
    hostgroup_name            zookeeper-servers
    servicegroups             ZOOKEEPER
    service_description       ZOOKEEPER::ZooKeeper Server process down
    check_command             check_tcp!2181!-w 1 -c 1
    normal_check_interval     1
    retry_check_interval      0.5
    max_check_attempts        3
}
<%end-%>
<%if scope.function_hdp_nagios_members_exist('hbasemaster')-%>
# HBase services, all in the HBASE servicegroup.

# RegionServer process check: check_tcp invoked with 60020.
# NOTE(review): this service targets the region-servers hostgroup but is
# guarded by hbasemaster membership -- confirm region-servers is always
# populated whenever hbasemaster is.
define service {
    use                       hadoop-service
    hostgroup_name            region-servers
    servicegroups             HBASE
    service_description       REGIONSERVER::RegionServer process down
    check_command             check_tcp!60020!-w 1 -c 1
    normal_check_interval     1
    retry_check_interval      0.5
    max_check_attempts        3
}

# HBase Master Web UI
define service {
    use                       hadoop-service
    hostgroup_name            hbasemaster
    servicegroups             HBASE
    service_description       HBASEMASTER::HBase Master Web UI down
    check_command             check_webui!hbase
    normal_check_interval     1
    retry_check_interval      1
    max_check_attempts        3
}

# HBase Master CPU utilization; left out when hdp_os_type is "suse".
<% if scope.function_hdp_template_var("hdp_os_type") != "suse"%>
define service {
    use                       hadoop-service
    hostgroup_name            hbasemaster
    servicegroups             HBASE
    service_description       HBASEMASTER::HBase Master CPU utilization
    check_command             check_cpu!200%!250%
    normal_check_interval     5
    retry_check_interval      2
    max_check_attempts        5
}
<% end %>

# HBase Master process check: check_tcp invoked with 60000
define service {
    use                       hadoop-service
    hostgroup_name            hbasemaster
    servicegroups             HBASE
    service_description       HBASEMASTER::HBase Master process down
    check_command             check_tcp!60000!-w 1 -c 1
    normal_check_interval     0.5
    retry_check_interval      0.25
    max_check_attempts        4
}
<%end-%>
<%if scope.function_hdp_nagios_members_exist('hiveserver')-%>
# Hive Metastore status check on the hiveserver hostgroup.
# With security_enabled set, the command is passed "true" followed by the
# Nagios user's headless keytab path and the Nagios user name; otherwise it is
# passed "false" and nothing more.
define service {
    use                       hadoop-service
    hostgroup_name            hiveserver
    servicegroups             HIVE-METASTORE
    service_description       HIVE-METASTORE::Hive Metastore status check
<%if scope.function_hdp_template_var("security_enabled")-%>
    check_command             check_hive_metastore_status!9083!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("keytab_path")%>/<%=scope.function_hdp_template_var("nagios_user")%>.headless.keytab!<%=scope.function_hdp_template_var("nagios_user")%>
<%else-%>
    check_command             check_hive_metastore_status!9083!<%=scope.function_hdp_template_var("java64_home")%>!false
<%end-%>
    normal_check_interval     0.5
    retry_check_interval      0.5
    max_check_attempts        3
}
<%end-%>
<%if scope.function_hdp_nagios_members_exist('oozie-server')-%>
# Oozie Server status check on the oozie-server hostgroup.
# With security_enabled set, the command is passed "true" followed by the
# Nagios user's headless keytab path and the Nagios user name; otherwise it is
# passed "false" and nothing more.
define service {
    use                       hadoop-service
    hostgroup_name            oozie-server
    servicegroups             OOZIE
    service_description       OOZIE::Oozie Server status check
<%if scope.function_hdp_template_var("security_enabled")-%>
    check_command             check_oozie_status!11000!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("keytab_path")%>/<%=scope.function_hdp_template_var("nagios_user")%>.headless.keytab!<%=scope.function_hdp_template_var("nagios_user")%>
<%else-%>
    check_command             check_oozie_status!11000!<%=scope.function_hdp_template_var("java64_home")%>!false
<%end-%>
    normal_check_interval     1
    retry_check_interval      1
    max_check_attempts        3
}
<%end-%>
<%if scope.function_hdp_nagios_members_exist('webhcat-server')-%>
# WebHCat Server status check on the webhcat-server hostgroup, run through the
# check_templeton_status command.
# With security_enabled set, the command is passed "true" followed by the
# Nagios user's headless keytab path and the Nagios user name; otherwise it is
# passed "false" and nothing more.
define service {
    use                       hadoop-service
    hostgroup_name            webhcat-server
    servicegroups             WEBHCAT
    service_description       WEBHCAT::WebHCat Server status check
<%if scope.function_hdp_template_var("security_enabled")-%>
    check_command             check_templeton_status!50111!v1!true!<%=scope.function_hdp_template_var("keytab_path")%>/<%=scope.function_hdp_template_var("nagios_user")%>.headless.keytab!<%=scope.function_hdp_template_var("nagios_user")%>
<%else-%>
    check_command             check_templeton_status!50111!v1!false
<%end-%>
    normal_check_interval     1
    retry_check_interval      0.5
    max_check_attempts        3
}
<%end-%>