| # |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # |
| |
| |
| # NAGIOS SERVER Check (status log update) |
| <%if scope.function_hdp_nagios_members_exist('nagios-server')-%> |
| # Base template for every service check in this file: each definition below |
| # pulls these settings in with "use hadoop-service". "register 0" keeps this |
| # entry a template only, so Nagios never schedules it as a real service. |
| define service { |
| name hadoop-service |
| use generic-service |
| notification_options w,u,c,r,f,s |
| first_notification_delay 0 |
| notification_interval 0 ; Send the notification once |
| contact_groups admins |
| notifications_enabled 1 |
| event_handler_enabled 1 |
| register 0 |
| } |
| |
| # Self-check of the Nagios server: runs check_nagios against the status file |
| # /var/nagios/status.dat. Arguments, in order: 10, the status file path, and |
| # the daemon lookup string resolved from nagios_lookup_daemon_str. |
| # NOTE(review): 10 is presumably the maximum accepted age of the status file; |
| # confirm the unit against the check_nagios command definition. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description NAGIOS::Nagios status log freshness |
| servicegroups NAGIOS |
| check_command check_nagios!10!/var/nagios/status.dat!<%=scope.function_hdp_template_var("::hdp-nagios::server::config::nagios_lookup_daemon_str")%> |
| normal_check_interval 5 |
| retry_check_interval 0.5 |
| max_check_attempts 2 |
| } |
| |
| # NAGIOS SERVER HDFS Checks |
| # Aggregate check executed on the nagios-server hostgroup. The quoted argument |
| # is the service_description of the per-host "DATANODE::DataNode space" check |
| # defined later in this file, so the two strings must be kept in sync. |
| # NOTE(review): 10% and 30% are presumably the warning and critical |
| # percentages; confirm against the check_aggregate command definition. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description HDFS::Percent DataNodes with space available |
| servicegroups HDFS |
| check_command check_aggregate!"DATANODE::DataNode space"!10%!30% |
| normal_check_interval 2 |
| retry_check_interval 1 |
| max_check_attempts 1 |
| } |
| |
| # Aggregate check executed on the nagios-server hostgroup. The quoted argument |
| # is the service_description of the per-host "DATANODE::DataNode process" |
| # check defined later in this file; renaming either one breaks the pairing. |
| # NOTE(review): 10% and 30% are presumably the warning and critical |
| # percentages; confirm against the check_aggregate command definition. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description HDFS::Percent DataNodes live |
| servicegroups HDFS |
| check_command check_aggregate!"DATANODE::DataNode process"!10%!30% |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| <% if scope.function_hdp_nagios_members_exist('namenode') && |
| (scope.function_hdp_get_major_stack_version([scope.function_hdp_template_var("stack_version")]) >= 2) && |
| (scope.function_hdp_template_var("::hdp::params::dfs_ha_enabled"))%> |
| # NameNode HA health check. The enclosing conditional renders it only when the |
| # namenode hostgroup has members, the major stack version is at least 2 and |
| # dfs_ha_enabled is truthy. check_namenodes_ha receives the members of the |
| # namenode hostgroup (via the $HOSTGROUPMEMBERS:namenode$ macro) and the |
| # NameNode port. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description HDFS::NameNode HA Healthy |
| servicegroups HDFS |
| check_command check_namenodes_ha!$HOSTGROUPMEMBERS:namenode$!<%=scope.function_hdp_template_var("::hdp::namenode_port")%> |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 5 |
| } |
| <%end-%> |
| |
| # AMBARI AGENT Checks |
| # One check_tcp service is generated per entry of all_hosts. The port for a |
| # host is read from the same index of ::all_ping_ports, so both lists are |
| # expected to have the same length and ordering. |
| # NOTE(review): a host without a matching port entry would render an empty |
| # port argument; confirm the two lists are always built in parallel. |
| <%scope.function_hdp_template_var("all_hosts").each_with_index do |hostname, index|-%> |
| define service { |
| host_name <%=hostname%> |
| use hadoop-service |
| service_description AMBARI::Ambari Agent process |
| servicegroups AMBARI |
| check_command check_tcp!<%=scope.function_hdp_template_var("::all_ping_ports")[index]%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| |
| <%end-%> |
| |
| # NAGIOS SERVER ZOOKEEPER Checks |
| <%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%> |
| # Aggregate check executed on the nagios-server hostgroup. The quoted argument |
| # is the service_description of the per-host "ZOOKEEPER::ZooKeeper Server |
| # process" check defined later in this file; the strings must stay identical. |
| # NOTE(review): 35% and 70% are presumably the warning and critical |
| # percentages; confirm against the check_aggregate command definition. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description ZOOKEEPER::Percent ZooKeeper Servers live |
| servicegroups ZOOKEEPER |
| check_command check_aggregate!"ZOOKEEPER::ZooKeeper Server process"!35%!70% |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| |
| # NAGIOS SERVER HBASE Checks |
| <%if scope.function_hdp_nagios_members_exist('hbasemasters')-%> |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description HBASE::Percent RegionServers live |
| servicegroups HBASE |
| check_command check_aggregate!"REGIONSERVER::RegionServer process"!10%!30% |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| <%end-%> |
| |
| |
| |
| # GANGLIA SERVER Checks |
| <%if scope.function_hdp_nagios_members_exist('ganglia-server')-%> |
| define service { |
| hostgroup_name ganglia-server |
| use hadoop-service |
| service_description GANGLIA::Ganglia Server process |
| servicegroups GANGLIA |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_port")%>!-w 1 -c 1 |
| normal_check_interval 0.25 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| |
| define service { |
| hostgroup_name ganglia-server |
| use hadoop-service |
| service_description GANGLIA::Ganglia Monitor process for Slaves |
| servicegroups GANGLIA |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_slaves_port")%>!-w 1 -c 1 |
| normal_check_interval 0.25 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| |
| define service { |
| hostgroup_name ganglia-server |
| use hadoop-service |
| service_description GANGLIA::Ganglia Monitor process for NameNode |
| servicegroups GANGLIA |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_namenode_port")%>!-w 1 -c 1 |
| normal_check_interval 0.25 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| |
| <%if scope.function_hdp_nagios_members_exist('jobtracker')-%> |
| define service { |
| hostgroup_name ganglia-server |
| use hadoop-service |
| service_description GANGLIA::Ganglia Monitor process for JobTracker |
| servicegroups GANGLIA |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_jobtracker_port")%>!-w 1 -c 1 |
| normal_check_interval 0.25 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('hbasemasters')-%> |
| define service { |
| hostgroup_name ganglia-server |
| use hadoop-service |
| service_description GANGLIA::Ganglia Monitor process for HBase Master |
| servicegroups GANGLIA |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_hbase_port")%>!-w 1 -c 1 |
| normal_check_interval 0.25 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('resourcemanager')-%> |
| define service { |
| hostgroup_name ganglia-server |
| use hadoop-service |
| service_description GANGLIA::Ganglia Monitor process for ResourceManager |
| servicegroups GANGLIA |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_rm_port")%>!-w 1 -c 1 |
| normal_check_interval 0.25 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('historyserver2')-%> |
| define service { |
| hostgroup_name ganglia-server |
| use hadoop-service |
| service_description GANGLIA::Ganglia Monitor process for HistoryServer |
| servicegroups GANGLIA |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_hs_port")%>!-w 1 -c 1 |
| normal_check_interval 0.25 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| <%end-%> |
| |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('snamenode')-%> |
| # Secondary NameNode checks |
| define service { |
| hostgroup_name snamenode |
| use hadoop-service |
| service_description NAMENODE::Secondary NameNode process |
| servicegroups HDFS |
| check_command check_tcp!<%=scope.function_hdp_template_var("snamenode_port")%>!-w 1 -c 1 |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| <%if scope.function_hdp_nagios_members_exist('namenode')-%> |
| # HDFS Checks |
| <% @namenodes = scope.function_hdp_template_var("::hdp::params::namenode_host"); @namenodes.each do |namenode| -%> |
| |
| define service { |
| host_name <%= namenode %> |
| use hadoop-service |
| service_description NAMENODE::NameNode edit logs directory status on <%= namenode %> |
| servicegroups HDFS |
| check_command check_name_dir_status!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 0.5 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
| host_name <%= namenode %> |
| use hadoop-service |
| service_description NAMENODE::NameNode host CPU utilization on <%= namenode %> |
| servicegroups HDFS |
| check_command check_cpu!200%!250% |
| normal_check_interval 5 |
| retry_check_interval 2 |
| max_check_attempts 5 |
| } |
| <% end %> |
| |
| define service { |
| host_name <%= namenode %> |
| use hadoop-service |
| service_description NAMENODE::NameNode Web UI on <%= namenode %> |
| servicegroups HDFS |
| check_command check_webui!namenode!<%=scope.function_hdp_template_var("::hdp::namenode_port")%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| |
| define service { |
| host_name <%= namenode %> |
| use hadoop-service |
| service_description NAMENODE::NameNode process on <%= namenode %> |
| servicegroups HDFS |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::namenode_metadata_port")%>!-w 1 -c 1 |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| |
| define service { |
| host_name <%= namenode %> |
| use hadoop-service |
| service_description HDFS::NameNode RPC latency on <%= namenode %> |
| servicegroups HDFS |
| check_command check_rpcq_latency!NameNode!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>!3000!5000!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 5 |
| retry_check_interval 1 |
| max_check_attempts 5 |
| } |
| |
| <% end -%> |
| |
| # Cluster-wide HDFS block health, executed on the nagios-server hostgroup. |
| # Arguments passed to check_hdfs_blocks, in order: members of the namenode |
| # hostgroup, NameNode port, 0%, 0%, the NameNode metrics property, the |
| # hadoop_ssl_enabled flag, Nagios keytab path, Nagios principal name, local |
| # kinit path and the security_enabled flag. |
| # NOTE(review): the two 0% values are presumably the warning and critical |
| # thresholds, i.e. any affected block alerts; confirm that is intended. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description HDFS::Blocks health |
| servicegroups HDFS |
| check_command check_hdfs_blocks!$HOSTGROUPMEMBERS:namenode$!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>!0%!0%!<%=scope.function_hdp_template_var("::hdp-nagios::params::nn_metrics_property")%>!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 2 |
| retry_check_interval 1 |
| max_check_attempts 1 |
| } |
| |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description HDFS::HDFS capacity utilization |
| servicegroups HDFS |
| check_command check_hdfs_capacity!$HOSTGROUPMEMBERS:namenode$!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>!80%!90%!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 10 |
| retry_check_interval 1 |
| max_check_attempts 1 |
| } |
| |
| <%end-%> |
| |
| # MAPREDUCE Checks |
| <%if scope.function_hdp_nagios_members_exist('jobtracker')-%> |
| define service { |
| hostgroup_name jobtracker |
| use hadoop-service |
| service_description JOBTRACKER::JobTracker Web UI |
| servicegroups MAPREDUCE |
| check_command check_webui!jobtracker!<%=scope.function_hdp_template_var("::hdp::jtnode_port")%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| |
| define service { |
| hostgroup_name jobtracker |
| use hadoop-service |
| service_description JOBTRACKER::HistoryServer Web UI |
| servicegroups MAPREDUCE |
| check_command check_webui!jobhistory!<%=scope.function_hdp_template_var("::hdp::jobhistory_port")%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
| hostgroup_name jobtracker |
| use hadoop-service |
| service_description JOBTRACKER::JobTracker CPU utilization |
| servicegroups MAPREDUCE |
| check_command check_cpu!200%!250% |
| normal_check_interval 5 |
| retry_check_interval 2 |
| max_check_attempts 5 |
| } |
| <% end %> |
| |
| # TCP liveness check for the JobTracker: check_tcp is pointed at jtnode_port on |
| # every host of the jobtracker hostgroup. |
| define service { |
| hostgroup_name jobtracker |
| use hadoop-service |
| service_description JOBTRACKER::JobTracker process |
| servicegroups MAPREDUCE |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::jtnode_port")%>!-w 1 -c 1 |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| |
| define service { |
| hostgroup_name jobtracker |
| use hadoop-service |
| service_description MAPREDUCE::JobTracker RPC latency |
| servicegroups MAPREDUCE |
| check_command check_rpcq_latency!JobTracker!<%=scope.function_hdp_template_var("::hdp::jtnode_port")%>!3000!5000!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 5 |
| retry_check_interval 1 |
| max_check_attempts 5 |
| } |
| |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('tasktracker-servers')-%> |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description MAPREDUCE::Percent TaskTrackers live |
| servicegroups MAPREDUCE |
| check_command check_aggregate!"TASKTRACKER::TaskTracker process"!10%!30% |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| |
| # MAPREDUCE::TASKTRACKER Checks |
| define service { |
| hostgroup_name tasktracker-servers |
| use hadoop-service |
| service_description TASKTRACKER::TaskTracker process |
| servicegroups MAPREDUCE |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::tasktracker_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| |
| # MAPREDUCE::TASKTRACKER MapReduce local dir used space |
| # Disk usage check for the MapReduce local directories on every host of the |
| # tasktracker-servers hostgroup. The directory list comes from |
| # mapred-site/mapred.local.dir and is resolved with function_hdp_default, |
| # unlike the other checks in this file, which use function_hdp_template_var. |
| # NOTE(review): 85% is presumably the usage threshold that raises the alert; |
| # confirm against the check_mapred_local_dir_used_space command definition. |
| define service { |
| hostgroup_name tasktracker-servers |
| use hadoop-service |
| service_description TASKTRACKER::MapReduce local dir space |
| servicegroups MAPREDUCE |
| check_command check_mapred_local_dir_used_space!<%=scope.function_hdp_default("::hdp::mapred-site/mapred.local.dir")%>!85% |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| |
| <%end-%> |
| |
| |
| <%if scope.function_hdp_nagios_members_exist('resourcemanager')-%> |
| # YARN::RESOURCEMANAGER Checks |
| define service { |
| hostgroup_name resourcemanager |
| use hadoop-service |
| service_description RESOURCEMANAGER::ResourceManager Web UI |
| servicegroups YARN |
| check_command check_webui!resourcemanager!<%=scope.function_hdp_template_var("::hdp::rm_port")%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
| hostgroup_name resourcemanager |
| use hadoop-service |
| service_description RESOURCEMANAGER::ResourceManager CPU utilization |
| servicegroups YARN |
| check_command check_cpu!200%!250% |
| normal_check_interval 5 |
| retry_check_interval 2 |
| max_check_attempts 5 |
| } |
| <% end %> |
| |
| define service { |
| hostgroup_name resourcemanager |
| use hadoop-service |
| service_description RESOURCEMANAGER::ResourceManager RPC latency |
| servicegroups YARN |
| check_command check_rpcq_latency!ResourceManager!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!3000!5000!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 5 |
| retry_check_interval 1 |
| max_check_attempts 5 |
| } |
| |
| define service { |
| hostgroup_name resourcemanager |
| use hadoop-service |
| service_description RESOURCEMANAGER::ResourceManager process |
| servicegroups YARN |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| <% end %> |
| |
| <%if scope.function_hdp_nagios_members_exist('nodemanagers')-%> |
| # YARN::NODEMANAGER Checks |
| define service { |
| hostgroup_name nodemanagers |
| use hadoop-service |
| service_description NODEMANAGER::NodeManager process |
| servicegroups YARN |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::nm_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| |
| # Health check for every host of the nodemanagers hostgroup. Arguments passed |
| # to check_nodemanager_health, in order: NodeManager port, security_enabled |
| # flag, Nagios keytab path, Nagios principal name and local kinit path. |
| # NOTE(review): unlike the HDFS and RPC latency checks in this file, the |
| # security flag comes first here and kinit_path_local is looked up without the |
| # ::hdp::params:: prefix; confirm both match the command definition. |
| define service { |
| hostgroup_name nodemanagers |
| use hadoop-service |
| service_description NODEMANAGER::NodeManager health |
| servicegroups YARN |
| check_command check_nodemanager_health!<%=scope.function_hdp_template_var("::hdp::nm_port")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| # Aggregate check executed on the nagios-server hostgroup. The quoted argument |
| # is the service_description of the per-host "NODEMANAGER::NodeManager |
| # process" check defined above in the same conditional; keep them identical. |
| # NOTE(review): 10% and 30% are presumably the warning and critical |
| # percentages; confirm against the check_aggregate command definition. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description NODEMANAGER::Percent NodeManagers live |
| servicegroups YARN |
| check_command check_aggregate!"NODEMANAGER::NodeManager process"!10%!30% |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| <% end %> |
| |
| <%if scope.function_hdp_nagios_members_exist('historyserver2')-%> |
| # MAPREDUCE::JOBHISTORY Checks |
| define service { |
| hostgroup_name historyserver2 |
| use hadoop-service |
| service_description JOBHISTORY::HistoryServer Web UI |
| servicegroups MAPREDUCE |
| check_command check_webui!historyserver2!<%=scope.function_hdp_template_var("::hdp::hs_port")%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
| hostgroup_name historyserver2 |
| use hadoop-service |
| service_description JOBHISTORY::HistoryServer CPU utilization |
| servicegroups MAPREDUCE |
| check_command check_cpu!200%!250% |
| normal_check_interval 5 |
| retry_check_interval 2 |
| max_check_attempts 5 |
| } |
| <% end %> |
| |
| define service { |
| hostgroup_name historyserver2 |
| use hadoop-service |
| service_description JOBHISTORY::HistoryServer RPC latency |
| servicegroups MAPREDUCE |
| check_command check_rpcq_latency!JobHistoryServer!<%=scope.function_hdp_template_var("::hdp::hs_port")%>!3000!5000!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 5 |
| retry_check_interval 1 |
| max_check_attempts 5 |
| } |
| |
| define service { |
| hostgroup_name historyserver2 |
| use hadoop-service |
| service_description JOBHISTORY::HistoryServer process |
| servicegroups MAPREDUCE |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::hs_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| |
| <% end %> |
| |
| <%if scope.function_hdp_nagios_members_exist('journalnodes')-%> |
| # JournalNode checks |
| define service { |
| hostgroup_name journalnodes |
| use hadoop-service |
| service_description JOURNALNODE::JournalNode process |
| servicegroups HDFS |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::journalnode_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| |
| <%if scope.function_hdp_template_var("::hdp::params::dfs_ha_enabled")-%> |
| # Aggregate check executed on the nagios-server hostgroup; the enclosing |
| # conditional renders it only when dfs_ha_enabled is truthy. The quoted |
| # argument is the service_description of the per-host "JOURNALNODE::JournalNode |
| # process" check defined just above, so the two strings must stay identical. |
| # NOTE(review): 33% and 50% are presumably the warning and critical |
| # percentages; confirm against the check_aggregate command definition. |
| define service { |
| hostgroup_name nagios-server |
| use hadoop-service |
| service_description HDFS::Percent JournalNodes live |
| servicegroups HDFS |
| check_command check_aggregate!"JOURNALNODE::JournalNode process"!33%!50% |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('slaves')-%> |
| # HDFS::DATANODE Checks |
| define service { |
| hostgroup_name slaves |
| use hadoop-service |
| service_description DATANODE::DataNode process |
| servicegroups HDFS |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::datanode_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| |
| # Per-host DataNode storage check for the slaves hostgroup. Arguments passed |
| # to check_datanode_storage, in order: DataNode port, 90%, 90%, the |
| # hadoop_ssl_enabled flag, Nagios keytab path, Nagios principal name, local |
| # kinit path and the security_enabled flag. The service_description is |
| # referenced by the "HDFS::Percent DataNodes with space available" aggregate. |
| # NOTE(review): if the two 90% values are the warning and critical thresholds, |
| # a warning can never be raised on its own; confirm this is intended. |
| define service { |
| hostgroup_name slaves |
| use hadoop-service |
| service_description DATANODE::DataNode space |
| servicegroups HDFS |
| check_command check_datanode_storage!<%=scope.function_hdp_template_var("::hdp::datanode_port")%>!90%!90%!<%=scope.function_hdp_template_var("::hdp::params::hadoop_ssl_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("::hdp::params::kinit_path_local")%>!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%> |
| normal_check_interval 5 |
| retry_check_interval 1 |
| max_check_attempts 2 |
| } |
| |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('flume-servers')-%> |
| # FLUME Checks |
| define service { |
| hostgroup_name flume-servers |
| use hadoop-service |
| service_description FLUME::Flume Agent process |
| servicegroups FLUME |
| check_command check_tcp!<%=scope.function_hdp_template_var("flume_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| |
| |
| <%if scope.function_hdp_nagios_members_exist('zookeeper-servers')-%> |
| # ZOOKEEPER Checks |
| define service { |
| hostgroup_name zookeeper-servers |
| use hadoop-service |
| service_description ZOOKEEPER::ZooKeeper Server process |
| servicegroups ZOOKEEPER |
| check_command check_tcp!<%=scope.function_hdp_template_var("::clientPort")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('hbasemasters')-%> |
| # HBASE::REGIONSERVER Checks |
| # TCP liveness check for every host of the region-servers hostgroup, using |
| # hbase_rs_port. The service_description is referenced by the |
| # "HBASE::Percent RegionServers live" aggregate, so the strings must match. |
| # NOTE(review): this definition targets region-servers but is rendered under |
| # the 'hbasemasters' membership conditional; confirm the region-servers |
| # hostgroup always has members whenever hbasemasters does. |
| define service { |
| hostgroup_name region-servers |
| use hadoop-service |
| service_description REGIONSERVER::RegionServer process |
| servicegroups HBASE |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::hbase_rs_port")%>!-w 1 -c 1 |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| |
| # HBASE::MASTER Checks |
| define service { |
| hostgroup_name hbasemasters |
| use hadoop-service |
| service_description HBASEMASTER::HBase Master Web UI |
| servicegroups HBASE |
| check_command check_webui!hbase!<%=scope.function_hdp_template_var("::hdp::hbase_master_port")%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| <% if scope.function_hdp_template_var("hdp_os_type") != "suse"%> |
| define service { |
| hostgroup_name hbasemasters |
| use hadoop-service |
| service_description HBASEMASTER::HBase Master CPU utilization |
| servicegroups HBASE |
| check_command check_cpu!200%!250% |
| normal_check_interval 5 |
| retry_check_interval 2 |
| max_check_attempts 5 |
| } |
| <% end %> |
| define service { |
| hostgroup_name hbasemasters |
| use hadoop-service |
| service_description HBASEMASTER::HBase Master process |
| servicegroups HBASE |
| check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::hbase_master_port")%>!-w 1 -c 1 |
| normal_check_interval 0.5 |
| retry_check_interval 0.25 |
| max_check_attempts 4 |
| } |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('hiveserver')-%> |
| # HIVE Metastore check |
| # Hive Metastore status check for the hiveserver hostgroup. Exactly one |
| # check_command line is rendered, chosen by security_enabled: |
| # - secure: metastore port, java64_home, literal true, Nagios keytab path, |
| # Nagios principal name and local kinit path; |
| # - non-secure: metastore port, java64_home and literal false. |
| define service { |
| hostgroup_name hiveserver |
| use hadoop-service |
| service_description HIVE-METASTORE::Hive Metastore status |
| servicegroups HIVE-METASTORE |
| <%if scope.function_hdp_template_var("::hdp::params::security_enabled")-%> |
| check_command check_hive_metastore_status!<%=scope.function_hdp_template_var("::hive_metastore_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%> |
| <%else-%> |
| check_command check_hive_metastore_status!<%=scope.function_hdp_template_var("::hive_metastore_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!false |
| <%end-%> |
| normal_check_interval 0.5 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| <%if scope.function_hdp_nagios_members_exist('oozie-server')-%> |
| # Oozie check |
| define service { |
| hostgroup_name oozie-server |
| use hadoop-service |
| service_description OOZIE::Oozie Server status |
| servicegroups OOZIE |
| <%if scope.function_hdp_template_var("::hdp::params::security_enabled")-%> |
| check_command check_oozie_status!<%=scope.function_hdp_template_var("::hdp::oozie_server_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%> |
| <%else-%> |
| check_command check_oozie_status!<%=scope.function_hdp_template_var("::hdp::oozie_server_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!false |
| <%end-%> |
| normal_check_interval 1 |
| retry_check_interval 1 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| <%if scope.function_hdp_nagios_members_exist('webhcat-server')-%> |
| # WEBHCAT check |
| # WebHCat (Templeton) status check for the webhcat-server hostgroup. Exactly |
| # one check_command line is rendered, chosen by security_enabled: |
| # - secure: templeton port, v1, the rendered security_enabled value, Nagios |
| # keytab path, Nagios principal name and local kinit path; |
| # - non-secure: templeton port, v1 and literal false. |
| # NOTE(review): the Hive and Oozie checks pass a literal true in their secure |
| # branch, whereas this one interpolates the variable; confirm it renders as |
| # the value check_templeton_status expects. |
| define service { |
| hostgroup_name webhcat-server |
| use hadoop-service |
| service_description WEBHCAT::WebHCat Server status |
| servicegroups WEBHCAT |
| <%if scope.function_hdp_template_var("::hdp::params::security_enabled")-%> |
| check_command check_templeton_status!<%=scope.function_hdp_template_var("::hdp::templeton_port")%>!v1!<%=scope.function_hdp_template_var("::hdp::params::security_enabled")%>!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%> |
| <%else-%> |
| check_command check_templeton_status!<%=scope.function_hdp_template_var("::hdp::templeton_port")%>!v1!false |
| <%end-%> |
| normal_check_interval 1 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| |
| <%if scope.function_hdp_nagios_members_exist('hue-server')-%> |
| # Hue server status check for the hue-server hostgroup; check_hue_status is |
| # invoked without arguments. |
| # NOTE(review): normal_check_interval is 100 here, while every other service |
| # in this file uses 10 or less; confirm the long interval is intentional. |
| define service { |
| hostgroup_name hue-server |
| use hadoop-service |
| service_description HUE::Hue Server status |
| servicegroups HUE |
| check_command check_hue_status |
| normal_check_interval 100 |
| retry_check_interval 0.5 |
| max_check_attempts 3 |
| } |
| <%end-%> |
| |