#!/usr/bin/perl -w
###############################################################################
# $Id$
###############################################################################
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################

=head1 NAME

VCL::monitor_vcld - VCL management node daemon service monitoring utility

=head1 SYNOPSIS

 perl monitor_vcld.pl [OPTION]...

=head1 DESCRIPTION

 Usage: perl monitor_vcld.pl [OPTION]...

 Checks the VCL management node daemon service. Starts the service if it is not
 running. Restarts the service if number of seconds since the management node
 last checked into the VCL database is greater than the critical threshold.

   --service-name=NAME      name of the service to check (default: vcld)
   --warning-seconds=NUM    a notice is sent to the VCL system administrators if
                            the management node last checked into the VCL
                            database more than NUM seconds ago (default: 60)
   --critical-seconds=NUM   the service is restarted and a warning message is
                            sent to the VCL system administrators if the
                            management node last checked into the VCL database
                            more than NUM seconds ago (default: 180)

=cut

###############################################################################
package VCL::monitor_vcld;

# Specify the lib path using FindBin
use FindBin;
use lib "$FindBin::Bin/../lib";

# Specify the version of this module
our $VERSION = '2.5';

use strict;
use warnings;
use diagnostics;
no warnings 'redefine';

use English -no_match_vars;
use Getopt::Long;

###############################################################################

# Package globals holding this script's log file path and a daemon-mode flag.
# NOTE(review): neither variable is read anywhere in this file; presumably
# they are consulted by VCL::utils -- confirm before changing or removing.
our $LOGFILE = '/var/log/monitor_vcld.log';
our $DAEMON_MODE = 0;

# Handle --help before the main program body runs. The 'pass_through' setting
# leaves options that are not recognized here in @ARGV, so the GetOptions
# calls made later can still pick them up.
INIT {
	Getopt::Long::Configure('pass_through');
	
	my %early_options;
	GetOptions(\%early_options, 'help');
	
	if (defined $early_options{'help'}) {
		help();
	}
}

#==============================================================================

use VCL::utils;
use VCL::Module;

#..............................................................................
# Get the command line options
# All three options are collected by a single GetOptions call. Options which
# are not listed here stay in @ARGV because 'pass_through' was configured in
# the INIT block.
my $options = {};
GetOptions($options, 'service-name=s', 'warning-seconds=s', 'critical-seconds=s');

# Set default option values if not specified on the command line
my $vcld_service_name = defined($options->{'service-name'}) ? $options->{'service-name'} : 'vcld';
my $lastcheckin_warning_seconds = defined($options->{'warning-seconds'}) ? $options->{'warning-seconds'} : 60;
my $lastcheckin_critical_seconds = defined($options->{'critical-seconds'}) ? $options->{'critical-seconds'} : 180;

# Verify explicit option values
# \A and \z are used instead of ^ and $ so that a value with a trailing newline
# is rejected. help() exits, so only the first problem found is reported.
if ($lastcheckin_warning_seconds !~ /\A\d+\z/) {
	print_warning("--warning-seconds argument is not an integer: $lastcheckin_warning_seconds");
	help();
}
elsif ($lastcheckin_critical_seconds !~ /\A\d+\z/) {
	print_warning("--critical-seconds argument is not an integer: $lastcheckin_critical_seconds");
	help();
}
elsif ($lastcheckin_warning_seconds > $lastcheckin_critical_seconds) {
	# Equal thresholds are accepted, so the message states exactly what was
	# tested rather than claiming the warning value must be strictly less
	print_warning("--warning-seconds argument ($lastcheckin_warning_seconds) is greater than --critical-seconds argument ($lastcheckin_critical_seconds)");
	help();
}

#..............................................................................
# Build the management node OS object which performs all service operations
my $mn_os_perl_package = 'VCL::Module::OS::Linux::ManagementNode';
my $mn_os = VCL::Module::create_object($mn_os_perl_package);
unless ($mn_os) {
	print_warning('failed to create management node OS object');
	exit 1;
}

# Make the object act as its own management node OS
# Some code in Linux.pm goes through $self->mn_os, so it must be populated
$mn_os->set_mn_os($mn_os);

# Short name of this management node, used in the messages printed below
my $mn_data = $mn_os->data;
my $management_node_name = $mn_data->get_management_node_short_name();

#..............................................................................

print_message("checking $vcld_service_name service on $management_node_name, last checkin thresholds, warning: $lastcheckin_warning_seconds seconds, critical: $lastcheckin_critical_seconds");

# Give up immediately if the service does not exist on this management node
unless ($mn_os->service_exists($vcld_service_name)) {
	print_warning("$vcld_service_name service does not exist on $management_node_name");
	exit 1;
}

# Find out whether the service is running
# An undefined result means the state could not be determined
my $service_running = $mn_os->is_service_running($vcld_service_name);
unless (defined $service_running) {
	print_critical("failed to determine if $vcld_service_name service is running on $management_node_name");
	exit 1;
}

if ($service_running) {
	print_message("$vcld_service_name service is running on $management_node_name");
}
else {
	print_warning("$vcld_service_name service is not running on $management_node_name");
	
	# Try to start the service, there is nothing left to check if this fails
	unless ($mn_os->start_service($vcld_service_name)) {
		print_critical("failed to start $vcld_service_name service on $management_node_name");
		exit 1;
	}
	
	print_message("started $vcld_service_name service on $management_node_name, waiting 30 seconds before checking if daemon is checking into database");
	
	# Pause for 30 seconds before the last checkin time is examined
	sleep_uninterrupted(30);
}

# Service is running, check management node last checkin time
my $management_node_info = get_management_node_info();
if (!defined($management_node_info)) {
	print_critical("failed to retrieve management node info for $management_node_name");
	exit 1;
}

my $lastcheckin_timestamp = $management_node_info->{lastcheckin};
if (!defined($lastcheckin_timestamp)) {
	print_critical("failed to retrieve lastcheckin timestamp from management node info, 'lastcheckin' key was not found:\n" . format_data($management_node_info));
	exit 1;
}

# The elapsed time is calculated from the epoch value, so it must be present
# Without this check an undefined value would be treated as 0, the elapsed
# time would equal the current epoch and the service would be restarted
my $lastcheckin_epoch_seconds = $management_node_info->{lastcheckin_epoch};
if (!defined($lastcheckin_epoch_seconds)) {
	print_critical("failed to retrieve lastcheckin epoch seconds from management node info, 'lastcheckin_epoch' key was not found:\n" . format_data($management_node_info));
	exit 1;
}

# NOTE(review): convert_to_epoch_seconds is called without an argument and is
# presumably returning the current time -- confirm in VCL::utils
my $current_epoch_seconds = convert_to_epoch_seconds();
my $current_timestamp = makedatestring();
my $lastcheckin_seconds_ago = ($current_epoch_seconds - $lastcheckin_epoch_seconds);

# This message displays the timestamp information from the management node and the database
my $detailed_ts_message = <<"END_MESSAGE";
	Current Time = $current_timestamp
	Current epoch = $current_epoch_seconds
	Last Checkin Time = $lastcheckin_timestamp
	Last Checkin epoch = $lastcheckin_epoch_seconds
END_MESSAGE

if ($lastcheckin_seconds_ago < 0) {
	# No action is taken, execution falls through to the normal exit below
	print_warning("$management_node_name last checkin time is in the future: $lastcheckin_timestamp($lastcheckin_epoch_seconds), exiting");
}
elsif ($lastcheckin_seconds_ago < $lastcheckin_warning_seconds) {
	print_message("$management_node_name last checked in $lastcheckin_seconds_ago seconds ago at $lastcheckin_timestamp($lastcheckin_epoch_seconds)");
}
elsif ($lastcheckin_seconds_ago >= $lastcheckin_critical_seconds) {
	my $critical_message = "critical threshold exceeded, $management_node_name last checked in $lastcheckin_seconds_ago seconds ago at $lastcheckin_timestamp($lastcheckin_epoch_seconds)";
	
	# Attempt to restart the vcld service
	if ($mn_os->restart_service($vcld_service_name)) {
		print_critical("$critical_message, $vcld_service_name service restarted");
	}
	else {
		print_critical("$critical_message, failed to restart $vcld_service_name service");
	}
	
	# The timestamp details are reported regardless of the restart result
	print_critical($detailed_ts_message);
}
else {
	# Warning threshold reached but not the critical threshold
	# NOTE(review): print_critical is used here even though this is the warning
	# case, presumably so that the administrators are notified -- confirm
	print_critical("last checkin warning threshold exceeded, $management_node_name last checked in $lastcheckin_seconds_ago seconds ago at $lastcheckin_timestamp($lastcheckin_epoch_seconds)");
	print_critical($detailed_ts_message);
}

print_message('done');

exit 0;

#/////////////////////////////////////////////////////////////////////////////

=head2 get_message_prefix

 Parameters  : none
 Returns     : string
 Description : Builds the pipe-delimited prefix placed in front of every line
               printed by this script. The prefix consists of the current
               date string, the process ID, three empty fields, the fixed
               label "monitor_vcld.pl:main" and the line number reported by
               caller(1), followed by a trailing pipe. This subroutine is
               intended to be called from the print_* subroutines so that
               caller(1) refers to the line which called print_*.

=cut

sub get_message_prefix {
	# Frame 1 is the caller of the subroutine which called this one
	my (undef, undef, $caller_line_number) = caller(1);
	
	# Resulting format:
	# <date string>|<PID>||||monitor_vcld.pl:main|<line number>|
	my $timestamp = makedatestring();
	return $timestamp . '|' . $PID . '||||monitor_vcld.pl:main|' . $caller_line_number . '|';
}

#/////////////////////////////////////////////////////////////////////////////

=head2 print_message

 Parameters  : $message
 Returns     : 1
 Description : Prints an informational message preceded by the prefix returned
               by get_message_prefix and passes the message to
               VCL::utils::notify using $ERRORS{'OK'}.

=cut

sub print_message {
	my $message = shift;
	
	# get_message_prefix must be called directly from this subroutine so that
	# the line number in the prefix is that of the caller of print_message
	my $output_line = get_message_prefix() . $message . "\n";
	print $output_line;
	
	VCL::utils::notify($ERRORS{'OK'}, 0, $message);
	return 1;
}

#/////////////////////////////////////////////////////////////////////////////

=head2 print_warning

 Parameters  : $message
 Returns     : 1
 Description : Prints a message labeled "WARNING:" preceded by the prefix
               returned by get_message_prefix and passes the message to
               VCL::utils::notify using $ERRORS{'WARNING'}.

=cut

sub print_warning {
	my $message = shift;
	
	# get_message_prefix must be called directly from this subroutine so that
	# the line number in the prefix is that of the caller of print_warning
	my $output_line = get_message_prefix() . 'WARNING: ' . $message . "\n";
	print $output_line;
	
	VCL::utils::notify($ERRORS{'WARNING'}, 0, $message);
	return 1;
}

#/////////////////////////////////////////////////////////////////////////////

=head2 print_critical

 Parameters  : $message
 Returns     : 1
 Description : Prints a message labeled "CRITICAL:" preceded by the prefix
               returned by get_message_prefix and passes the message to
               VCL::utils::notify using $ERRORS{'CRITICAL'}.

=cut

sub print_critical {
	my $message = shift;
	
	# get_message_prefix must be called directly from this subroutine so that
	# the line number in the prefix is that of the caller of print_critical
	my $output_line = get_message_prefix() . 'CRITICAL: ' . $message . "\n";
	print $output_line;
	
	VCL::utils::notify($ERRORS{'CRITICAL'}, 0, $message);
	return 1;
}

#/////////////////////////////////////////////////////////////////////////////

=head2 help

 Parameters  : none
 Returns     : exits
 Description : Displays a help message and exits. The exit status is always 1,
               whether help was requested with --help or is displayed because
               an argument failed validation.

=cut

sub help {
	
	# The default log path is written out literally instead of interpolating
	# $LOGFILE because this subroutine may be called from the INIT block, before
	# the runtime assignment to $LOGFILE has been executed.
	# NOTE(review): --conf, --log and --verbose are not parsed in this file,
	# presumably they are handled by VCL::utils -- confirm.
	print <<'EOF';
Usage: perl monitor_vcld.pl [OPTION]...

Checks the VCL management node daemon service. Starts the service if it is not
running. Restarts the service if number of seconds since the management node
last checked into the VCL database is greater than the critical threshold.

  --service-name=NAME      name of the service to check
                           (default: vcld)
  --warning-seconds=NUM    a notice is sent to the VCL system administrators if
                           the management node last checked into the VCL
                           database more than NUM seconds ago
                           (default: 60 seconds)
  --critical-seconds=NUM   the service is restarted and a warning message is
                           sent to the VCL system administrators if the
                           management node last checked into the VCL database
                           more than NUM seconds ago
                           (default: 180 seconds)
  --conf=<path>            specify monitor_vcld.pl configuration file
                           (default: /etc/vcl/vcld.conf)
  --log=<path>             specify monitor_vcld.pl log file
                           (default: /var/log/monitor_vcld.log)
  --verbose                generate verbose log output

EOF

	exit 1;
}

###############################################################################

1;
__END__

=head1 SEE ALSO

L<http://cwiki.apache.org/VCL/>

=cut
