| # |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # |
| |
| |
| # NAGIOS SERVER Check (status log update) |
| <%if scope.function_hdp_nagios_members_exist('nagios-server')-%> |
| # Base template inherited by every service in this file via "use hadoop-service". |
| # It is a partial definition (no host, description or check command), so it is |
| # marked "register 0": the register directive is not inherited from |
| # generic-service and has to be set on each template explicitly. |
| # A notification_interval of 0 sends the notification only once. |
| define service { |
|     name                        hadoop-service |
|     use                         generic-service |
|     notification_options        w,u,c |
|     first_notification_delay    0 |
|     notification_interval       0 |
|     register                    0 |
| } |
| |
| # Self-check on the Nagios status file (/var/nagios/status.dat), run against |
| # the nagios-server hostgroup every 5 intervals with one retry. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          nagios-server |
|     servicegroups           NAGIOS |
|     service_description     NAGIOS::Nagios status log staleness |
|     check_command           check_nagios!10!/var/nagios/status.dat!<%=nagios_lookup_daemon_str%> |
|     normal_check_interval   5 |
|     retry_check_interval    0.5 |
|     max_check_attempts      2 |
| } |
| |
| # HDFS aggregate checks, evaluated on the Nagios server. |
| # Each check_aggregate call names a per-host service description followed by |
| # two percentage thresholds (presumably warning, then critical - confirm |
| # against the check_aggregate command definition). |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          nagios-server |
|     servicegroups           HDFS |
|     service_description     HDFS::Percent DataNodes storage full |
|     check_command           check_aggregate!"DATANODE::DataNode storage full"!10%!30% |
|     normal_check_interval   2 |
|     retry_check_interval    1 |
|     max_check_attempts      1 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          nagios-server |
|     servicegroups           HDFS |
|     service_description     HDFS::Percent DataNodes down |
|     check_command           check_aggregate!"DATANODE::DataNode process down"!10%!30% |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      3 |
| } |
| |
| # MapReduce aggregate check, evaluated on the Nagios server: rolls up the |
| # per-host "TASKTRACKER::TaskTracker process down" service. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          nagios-server |
|     servicegroups           MAPREDUCE |
|     service_description     MAPREDUCE::Percent TaskTrackers down |
|     check_command           check_aggregate!"TASKTRACKER::TaskTracker process down"!10%!30% |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      3 |
| } |
| |
| # ZooKeeper aggregate check on the Nagios server; only rendered when the |
| # zookeeper-servers hostgroup has members. |
| <% if scope.function_hdp_nagios_members_exist('zookeeper-servers') -%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          nagios-server |
|     servicegroups           ZOOKEEPER |
|     service_description     ZOOKEEPER::Percent ZooKeeper Servers down |
|     check_command           check_aggregate!"ZOOKEEPER::ZooKeeper Server process down"!35%!70% |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      3 |
| } |
| <% end -%> |
| |
| # HBase aggregate check on the Nagios server; only rendered when the |
| # hbasemaster hostgroup has members. |
| <% if scope.function_hdp_nagios_members_exist('hbasemaster') -%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          nagios-server |
|     servicegroups           HBASE |
|     service_description     HBASE::Percent RegionServers down |
|     check_command           check_aggregate!"REGIONSERVER::RegionServer process down"!10%!30% |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      3 |
| } |
| <% end -%> |
| <%end-%> |
| |
| # Ganglia checks: one TCP probe per daemon port on the ganglia-server |
| # hostgroup (8651 for gmetad, 8660-8663 for the per-role gmond collectors). |
| # The HBase Master collector probe is rendered only when hbasemaster has members. |
| <% if scope.function_hdp_nagios_members_exist('ganglia-server') -%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          ganglia-server |
|     servicegroups           GANGLIA |
|     service_description     GANGLIA::Ganglia [gmetad] process down |
|     check_command           check_tcp!8651!-w 1 -c 1 |
|     normal_check_interval   0.25 |
|     retry_check_interval    0.25 |
|     max_check_attempts      4 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          ganglia-server |
|     servicegroups           GANGLIA |
|     service_description     GANGLIA::Ganglia Collector [gmond] process down alert for slaves |
|     check_command           check_tcp!8660!-w 1 -c 1 |
|     normal_check_interval   0.25 |
|     retry_check_interval    0.25 |
|     max_check_attempts      4 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          ganglia-server |
|     servicegroups           GANGLIA |
|     service_description     GANGLIA::Ganglia Collector [gmond] process down alert for NameNode |
|     check_command           check_tcp!8661!-w 1 -c 1 |
|     normal_check_interval   0.25 |
|     retry_check_interval    0.25 |
|     max_check_attempts      4 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          ganglia-server |
|     servicegroups           GANGLIA |
|     service_description     GANGLIA::Ganglia Collector [gmond] process down alert for JobTracker |
|     check_command           check_tcp!8662!-w 1 -c 1 |
|     normal_check_interval   0.25 |
|     retry_check_interval    0.25 |
|     max_check_attempts      4 |
| } |
| |
| <% if scope.function_hdp_nagios_members_exist('hbasemaster') -%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          ganglia-server |
|     servicegroups           GANGLIA |
|     service_description     GANGLIA::Ganglia Collector [gmond] process down alert for HBase Master |
|     check_command           check_tcp!8663!-w 1 -c 1 |
|     normal_check_interval   0.25 |
|     retry_check_interval    0.25 |
|     max_check_attempts      4 |
| } |
| <% end -%> |
| <% end -%> |
| |
| <% if scope.function_hdp_nagios_members_exist('snamenode') -%> |
| # Secondary NameNode: TCP probe of port 50090 on the snamenode hostgroup. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          snamenode |
|     servicegroups           HDFS |
|     service_description     NAMENODE::Secondary NameNode process down |
|     check_command           check_tcp!50090!-w 1 -c 1 |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      3 |
| } |
| <% end -%> |
| <% if scope.function_hdp_nagios_members_exist('namenode') -%> |
| # NameNode and HDFS checks, all bound to the namenode hostgroup. |
| # The CPU utilization check is left out when hdp_os_type is "suse". |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          namenode |
|     servicegroups           HDFS |
|     service_description     NAMENODE::NameNode Web UI down |
|     check_command           check_webui!namenode |
|     normal_check_interval   1 |
|     retry_check_interval    1 |
|     max_check_attempts      3 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          namenode |
|     servicegroups           HDFS |
|     service_description     NAMENODE::NameNode edit logs directory status |
|     check_command           check_name_dir_status!50070 |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.5 |
|     max_check_attempts      3 |
| } |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          namenode |
|     servicegroups           HDFS |
|     service_description     NAMENODE::NameNode host CPU utilization |
|     check_command           check_cpu!200%!250% |
|     normal_check_interval   5 |
|     retry_check_interval    2 |
|     max_check_attempts      5 |
| } |
| <% end %> |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          namenode |
|     servicegroups           HDFS |
|     service_description     NAMENODE::NameNode process down |
|     check_command           check_tcp!8020!-w 1 -c 1 |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      3 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          namenode |
|     servicegroups           HDFS |
|     service_description     HDFS::Corrupt/Missing blocks |
|     check_command           check_hdfs_blocks!50070!0%!0% |
|     normal_check_interval   2 |
|     retry_check_interval    1 |
|     max_check_attempts      1 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          namenode |
|     servicegroups           HDFS |
|     service_description     HDFS::HDFS capacity utilization |
|     check_command           check_hdfs_capacity!50070!80%!90% |
|     normal_check_interval   10 |
|     retry_check_interval    1 |
|     max_check_attempts      1 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          namenode |
|     servicegroups           HDFS |
|     service_description     HDFS::NameNode RPC latency |
|     check_command           check_rpcq_latency!NameNode!50070!3000!5000 |
|     normal_check_interval   5 |
|     retry_check_interval    1 |
|     max_check_attempts      5 |
| } |
| <% end -%> |
| |
| # MapReduce checks. Everything here, including the per-slave TaskTracker |
| # probe at the end, is rendered only when the jobtracker hostgroup has members. |
| # The CPU utilization check is left out when hdp_os_type is "suse". |
| <% if scope.function_hdp_nagios_members_exist('jobtracker') -%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          jobtracker |
|     servicegroups           MAPREDUCE |
|     service_description     JOBTRACKER::JobTracker Web UI down |
|     check_command           check_webui!jobtracker |
|     normal_check_interval   1 |
|     retry_check_interval    1 |
|     max_check_attempts      3 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          jobtracker |
|     servicegroups           MAPREDUCE |
|     service_description     JOBTRACKER::JobHistory Web UI down |
|     check_command           check_webui!jobhistory |
|     normal_check_interval   1 |
|     retry_check_interval    1 |
|     max_check_attempts      3 |
| } |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          jobtracker |
|     servicegroups           MAPREDUCE |
|     service_description     JOBTRACKER::JobTracker CPU utilization |
|     check_command           check_cpu!200%!250% |
|     normal_check_interval   5 |
|     retry_check_interval    2 |
|     max_check_attempts      5 |
| } |
| <% end %> |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          jobtracker |
|     servicegroups           MAPREDUCE |
|     service_description     JOBTRACKER::JobTracker process down |
|     check_command           check_tcp!50030!-w 1 -c 1 |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      4 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          jobtracker |
|     servicegroups           MAPREDUCE |
|     service_description     MAPREDUCE::JobTracker RPC latency |
|     check_command           check_rpcq_latency!JobTracker!50030!3000!5000 |
|     normal_check_interval   5 |
|     retry_check_interval    1 |
|     max_check_attempts      5 |
| } |
| |
| # TaskTracker probe: runs against the slaves hostgroup, not jobtracker. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          slaves |
|     servicegroups           MAPREDUCE |
|     service_description     TASKTRACKER::TaskTracker process down |
|     check_command           check_tcp!50060!-w 1 -c 1 |
|     normal_check_interval   1 |
|     retry_check_interval    0.5 |
|     max_check_attempts      3 |
| } |
| |
| <% end -%> |
| |
| <% if scope.function_hdp_nagios_members_exist('slaves') -%> |
| # DataNode checks on the slaves hostgroup. The first argument of each command |
| # comes from the dfs_datanode_address / dfs_datanode_http_address template vars. |
| # NOTE(review): the storage check passes 90% for both thresholds - confirm |
| # that identical values are intended. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          slaves |
|     servicegroups           HDFS |
|     service_description     DATANODE::DataNode process down |
|     check_command           check_tcp!<%=scope.function_hdp_template_var("dfs_datanode_address")%>!-w 1 -c 1 |
|     normal_check_interval   1 |
|     retry_check_interval    0.5 |
|     max_check_attempts      3 |
| } |
| |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          slaves |
|     servicegroups           HDFS |
|     service_description     DATANODE::DataNode storage full |
|     check_command           check_datanode_storage!<%=scope.function_hdp_template_var("dfs_datanode_http_address")%>!90%!90% |
|     normal_check_interval   5 |
|     retry_check_interval    1 |
|     max_check_attempts      2 |
| } |
| |
| <% end -%> |
| |
| <% if scope.function_hdp_nagios_members_exist('zookeeper-servers') -%> |
| # ZooKeeper: TCP probe of port 2181 on every host in zookeeper-servers. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          zookeeper-servers |
|     servicegroups           ZOOKEEPER |
|     service_description     ZOOKEEPER::ZooKeeper Server process down |
|     check_command           check_tcp!2181!-w 1 -c 1 |
|     normal_check_interval   1 |
|     retry_check_interval    0.5 |
|     max_check_attempts      3 |
| } |
| <% end -%> |
| |
| <% if scope.function_hdp_nagios_members_exist('hbasemaster') -%> |
| # HBase RegionServer check: TCP probe of port 60020 on region-servers. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          region-servers |
|     servicegroups           HBASE |
|     service_description     REGIONSERVER::RegionServer process down |
|     check_command           check_tcp!60020!-w 1 -c 1 |
|     normal_check_interval   1 |
|     retry_check_interval    0.5 |
|     max_check_attempts      3 |
| } |
| |
| # HBase Master checks: web UI, CPU (skipped when hdp_os_type is "suse"), |
| # and a TCP probe of port 60000. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          hbasemaster |
|     servicegroups           HBASE |
|     service_description     HBASEMASTER::HBase Master Web UI down |
|     check_command           check_webui!hbase |
|     normal_check_interval   1 |
|     retry_check_interval    1 |
|     max_check_attempts      3 |
| } |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          hbasemaster |
|     servicegroups           HBASE |
|     service_description     HBASEMASTER::HBase Master CPU utilization |
|     check_command           check_cpu!200%!250% |
|     normal_check_interval   5 |
|     retry_check_interval    2 |
|     max_check_attempts      5 |
| } |
| <% end %> |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          hbasemaster |
|     servicegroups           HBASE |
|     service_description     HBASEMASTER::HBase Master process down |
|     check_command           check_tcp!60000!-w 1 -c 1 |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.25 |
|     max_check_attempts      4 |
| } |
| <% end -%> |
| |
| <% if scope.function_hdp_nagios_members_exist('hiveserver') -%> |
| # Hive Metastore status check on port 9083. When security_enabled is set the |
| # command also receives the nagios user's headless keytab path and user name. |
| # NOTE(review): the secure branch is taken for any truthy value - confirm |
| # security_enabled is a real boolean and not a non-empty string. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          hiveserver |
|     servicegroups           HIVE-METASTORE |
|     service_description     HIVE-METASTORE::Hive Metastore status check |
| <% if scope.function_hdp_template_var("security_enabled") -%> |
|     check_command           check_hive_metastore_status!9083!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("keytab_path")%>/<%=scope.function_hdp_template_var("nagios_user")%>.headless.keytab!<%=scope.function_hdp_template_var("nagios_user")%> |
| <% else -%> |
|     check_command           check_hive_metastore_status!9083!<%=scope.function_hdp_template_var("java64_home")%>!false |
| <% end -%> |
|     normal_check_interval   0.5 |
|     retry_check_interval    0.5 |
|     max_check_attempts      3 |
| } |
| <% end -%> |
| <% if scope.function_hdp_nagios_members_exist('oozie-server') -%> |
| # Oozie server status check on port 11000. When security_enabled is set the |
| # command also receives the nagios user's headless keytab path and user name. |
| # NOTE(review): the secure branch is taken for any truthy value - confirm |
| # security_enabled is a real boolean and not a non-empty string. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          oozie-server |
|     servicegroups           OOZIE |
|     service_description     OOZIE::Oozie Server status check |
| <% if scope.function_hdp_template_var("security_enabled") -%> |
|     check_command           check_oozie_status!11000!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("keytab_path")%>/<%=scope.function_hdp_template_var("nagios_user")%>.headless.keytab!<%=scope.function_hdp_template_var("nagios_user")%> |
| <% else -%> |
|     check_command           check_oozie_status!11000!<%=scope.function_hdp_template_var("java64_home")%>!false |
| <% end -%> |
|     normal_check_interval   1 |
|     retry_check_interval    1 |
|     max_check_attempts      3 |
| } |
| <% end -%> |
| <% if scope.function_hdp_nagios_members_exist('webhcat-server') -%> |
| # WebHCat (Templeton) status check on port 50111, API version v1. When |
| # security_enabled is set the command also receives the nagios user's |
| # headless keytab path and user name. |
| # NOTE(review): the secure branch is taken for any truthy value - confirm |
| # security_enabled is a real boolean and not a non-empty string. |
| define service { |
|     use                     hadoop-service |
|     hostgroup_name          webhcat-server |
|     servicegroups           WEBHCAT |
|     service_description     WEBHCAT::WebHCat Server status check |
| <% if scope.function_hdp_template_var("security_enabled") -%> |
|     check_command           check_templeton_status!50111!v1!true!<%=scope.function_hdp_template_var("keytab_path")%>/<%=scope.function_hdp_template_var("nagios_user")%>.headless.keytab!<%=scope.function_hdp_template_var("nagios_user")%> |
| <% else -%> |
|     check_command           check_templeton_status!50111!v1!false |
| <% end -%> |
|     normal_check_interval   1 |
|     retry_check_interval    0.5 |
|     max_check_attempts      3 |
| } |
| <% end -%> |