| #!/usr/bin/perl -w |
| ############################################################################### |
| # $Id$ |
| ############################################################################### |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| ############################################################################### |
| |
| =head1 NAME |
| |
| VCL::healthcheck |
| |
| =head1 SYNOPSIS |
| |
| use base qw(VCL::healthcheck); |
| |
| =head1 DESCRIPTION |
| |
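 Health-check module for the computers controlled by a VCL management node.
 On construction it retrieves the management node information and the
 computers that node controls. The process() method then asks each
 computer's provisioning module for its node status, updates the computer's
 state, image name and lastcheck time through the VCL::utils helpers, and
 can power computers down during a powerdown stage. The send_report()
 method mails a summary of the run to the system administrator.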
| |
| =cut |
| |
| ############################################################################## |
| package VCL::healthcheck; |
| |
| # Specify the lib path using FindBin |
| use FindBin; |
| use lib "$FindBin::Bin/.."; |
| |
| # Configure inheritance |
| use base qw(); |
| |
| # Specify the version of this module |
| our $VERSION = '2.00'; |
| |
| # Specify the version of Perl to use |
| use 5.008000; |
| |
| use strict; |
| use warnings; |
| use diagnostics; |
| use English qw( -no_match_vars ); |
| |
| use VCL::utils; |
| use DBI; |
| |
| ############################################################################## |
| |
| =head1 OBJECT METHODS |
| |
| =cut |
| |
| #///////////////////////////////////////////////////////////////////////////// |
| |
#----------GLOBALS--------------
# Path of the log file handed to every notify() and update_*() call below
our $LOG = '/var/log/healthcheckvcl.log';

# Package-level slot for a database handle; not assigned in the visible code
our $MYDBH;
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
| =head2 new |
| |
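 Parameters  : class name, followed by optional key/value pairs that are
               copied into the new object
 Returns     : blessed healthcheck object
 Description : Creates the object and calls _initialize() on it before
               returning it.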
| |
| =cut |
| |
# Constructor. Copies the caller's key/value pairs into a new hash, blesses
# it into the requested class and runs _initialize() on it before handing the
# object back.
sub new {
    my $class = shift;
    my %input = @_;

    my $self = { %input };
    bless $self, $class;

    # _initialize() fills in the management node and computer data
    $self->_initialize();

    return $self;
}
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
| =head2 _initialize |
| |
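 Parameters  : healthcheck object
 Returns     : not used by the caller; the process exits if the management
               node information or its computers cannot be retrieved
 Description : Logs the start of the run, stores the report header and log
               file path in the object, retrieves the management node
               information and the computers it controls, and replaces each
               computer entry with the result of get_computer_info().
               Computers for which no information is returned are removed.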
| |
| =cut |
| |
# Fills a freshly blessed healthcheck object with the data the later steps
# need.
#
# Parameters:
#   $info - the healthcheck object (hash reference) created by new()
# Returns:
#   1 once the object is populated. The process exits instead of returning
#   when the management node information or the list of controlled computers
#   cannot be retrieved.
# Side effects:
#   Sets $info->{globalmsg}{header}, $info->{logfile}, $info->{managementnode}
#   and $info->{computertable}.
sub _initialize {
    my ($info) = @_;

    my $date_time = convert_to_datetime();
    notify($ERRORS{'OK'}, $LOG, "########### healthcheck run $date_time #################");

    $info->{"globalmsg"}->{"header"} = "STATUS SUMMARY of VCL nodes:\n\n";
    $info->{"logfile"} = $LOG;

    #1 Collect the information about this management node
    $info->{managementnode} = get_management_node_info();
    if (!$info->{managementnode}) {
        notify($ERRORS{'CRITICAL'}, $LOG, "unable to retrieve management node information from database");
        exit;
    }
    notify($ERRORS{'OK'}, $LOG, "retrieved management node information from database");

    #2 Collect hash of computers I can control with data
    $info->{computertable} = get_computers_controlled_by_MN(%{$info->{managementnode}});
    if (!$info->{computertable}) {
        notify($ERRORS{'CRITICAL'}, $LOG, "unable to retrieve management node resource groups from database");
        exit;
    }
    notify($ERRORS{'OK'}, $LOG, "retrieved management node resource groups from database");

    #3 Replace every computer entry with its full computer information.
    # keys() builds the complete key list before the loop starts, so deleting
    # entries inside the loop is safe.
    foreach my $cid (keys %{$info->{computertable}}) {
        my $computer_info = get_computer_info($cid);
        if ($computer_info) {
            $info->{computertable}->{$cid} = $computer_info;
        }
        else {
            # No information available: drop the computer from this run
            delete $info->{computertable}->{$cid};
        }
    }

    return 1;
} ### end sub _initialize
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
| =head2 process |
| |
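 Parameters  : healthcheck object, optional powerdown stage string
               ('available', 'all', or a string beginning with 'restore')
 Returns     : 1 after all computers were handled, 0 if a provisioning
               module could not be loaded
 Description : For the 'available' and 'all' stages the management node is
               set to maintenance and every computer is passed to
               powerdown_event(). For a 'restore' stage the management node
               is set back to available. Otherwise, and after a restore,
               each computer in the available or failed state that was not
               checked within the last 60 minutes is checked through the
               node_status routine of its provisioning module, and its
               state and lastcheck time are updated accordingly.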
| |
| =cut |
| |
# Runs one health-check pass over every computer in $info->{computertable}.
#
# Parameters:
#   $info           - healthcheck object built by new()
#   $powerdownstage - optional stage string:
#                       'available' or 'all' : set the management node to
#                           maintenance and hand every computer to
#                           powerdown_event() instead of checking it
#                       beginning with 'restore' : set the management node
#                           back to available, then run the standard checks
#                       anything else or omitted : standard checks only
# Returns:
#   1 when the loop over all computers finished,
#   0 as soon as a provisioning module cannot be loaded.
# Side effects:
#   Resets $info->{globalmsg}{body}, appends to $info->{globalmsg}{failedbody},
#   maintains the computercount / computerschecked / computersskipped
#   counters and $info->{computers}{$cid}{skip}, and updates computer state,
#   image name and lastcheck through the VCL::utils helpers.
sub process {
    my ($info, $powerdownstage) = @_;

    # The stage argument is optional; normalise it so the pattern matches
    # below never run against an undefined value.
    $powerdownstage = '' if !defined $powerdownstage;
    my $powering_down = ($powerdownstage =~ /^(available|all)$/) ? 1 : 0;

    notify($ERRORS{'OK'}, $LOG, "in processing routine");
    $info->{"globalmsg"}->{"body"} = "Summary of VCL node monitoring system:\n\n";

    my $mn_hostname = $info->{managementnode}->{hostname};

    if ($powering_down) {
        notify($ERRORS{'CRITICAL'}, $LOG, "ALERT: powerdown stage triggered,placing MN $mn_hostname in maintenance");
        if (set_managementnode_state($info->{managementnode}, "maintenance")) {
            notify($ERRORS{'OK'}, $LOG, "Successfully set $mn_hostname into maintenance");
        }
        else {
            notify($ERRORS{'WARNING'}, $LOG, "Failed to set $mn_hostname into maintenance");
        }
    }
    elsif ($powerdownstage =~ /^restore/) {
        notify($ERRORS{'CRITICAL'}, $LOG, "ALERT: Environment OK: restoring state of MN $mn_hostname in available");
        if (set_managementnode_state($info->{managementnode}, "available")) {
            notify($ERRORS{'OK'}, $LOG, "Successfully set $mn_hostname into available");
        }
        else {
            notify($ERRORS{'WARNING'}, $LOG, "Failed to set $mn_hostname into available");
        }
    }

    foreach my $cid (keys %{$info->{computertable}}) {
        # Every entry is the hash reference stored by _initialize()
        my $computer_entry = $info->{computertable}->{$cid};

        #set some local variables
        my $comp_hostname             = $computer_entry->{computer}->{hostname};
        my $comp_type                 = $computer_entry->{computer}->{type};
        my $comp_state                = $computer_entry->{computer}->{state}->{name};
        my $provisioning_perl_package = $computer_entry->{computer}->{provisioning}->{module}->{perlpackage};
        my $last_check                = $computer_entry->{computer}->{lastcheck};
        my $image_os_name             = $computer_entry->{image}->{OS}->{name};
        my $comp_id                   = $cid;

        #need to pass some of the management node info to provisioning module node_status
        $computer_entry->{"managementnode"} = $info->{managementnode};
        $computer_entry->{"logfile"}        = $info->{logfile};

        notify($ERRORS{'DEBUG'}, $LOG, "cid= $cid");
        notify($ERRORS{'DEBUG'}, $LOG, "comp_hostname= $comp_hostname");
        notify($ERRORS{'DEBUG'}, $LOG, "comp_type= $comp_type");
        notify($ERRORS{'DEBUG'}, $LOG, "comp_state= $comp_state");
        notify($ERRORS{'DEBUG'}, $LOG, "provisioning_perl_package= $provisioning_perl_package");
        notify($ERRORS{'DEBUG'}, $LOG, "image_os_name= $image_os_name");

        # Collect current state of node - it could have changed since we started
        if (my $comp_current_state = get_computer_current_state_name($cid)) {
            $computer_entry->{computer}->{state}->{name} = $comp_current_state;
            $comp_state = $comp_current_state;
        }
        else {
            #could not get it, use existing data
            notify($ERRORS{'OK'}, $LOG, "could not retrieve current computer state cid= $cid, using old data");
        }

        # During a powerdown stage the computer is only handed to
        # powerdown_event(); none of the regular checks run for it.
        if ($powering_down) {
            $computer_entry->{"powerdownstage"} = $powerdownstage;
            if (powerdown_event($computer_entry)) {
                notify($ERRORS{'OK'}, $LOG, "Successfully powered down $comp_hostname");
            }
            next;
        }

        #Only perform actions on these available or failed computer states
        #skip if is inuse, maintenance, tovmhost, etc.
        if ($comp_state !~ /available|failed/) {
            notify($ERRORS{'OK'}, $LOG, "NODE $comp_hostname $comp_state skipping");
            $info->{computers}->{$cid}->{"skip"} = 1;
            $info->{"computersskipped"} += 1;
            next;
        }

        # Skip computers checked within the last 60 minutes, unless failed
        if (defined($last_check) && $comp_state !~ /failed/) {
            my $lastcheckepoch  = convert_to_epoch_seconds($last_check);
            my $currentimeepoch = convert_to_epoch_seconds();
            my $delta_minutes   = round(($currentimeepoch - $lastcheckepoch) / 60);

            if ($delta_minutes <= 60) {
                notify($ERRORS{'OK'}, $LOG, "NODE $comp_hostname recently checked $delta_minutes minutes ago skipping");
                #this node was recently checked
                $info->{computers}->{$cid}->{"skip"} = 1;
                $info->{"computersskipped"} += 1;
                next;
            }
            $info->{"computerschecked"} += 1;
        }

        #count the nodes processed
        $info->{"computercount"} += 1;

        # The package name is interpolated into a string eval, so only a
        # plain Perl package name is ever passed to it.
        my $module_loaded = 0;
        if (defined $provisioning_perl_package && $provisioning_perl_package =~ /\A\w+(?:::\w+)*\z/) {
            eval "use $provisioning_perl_package";
            $module_loaded = 1 if !$EVAL_ERROR;
        }
        if (!$module_loaded) {
            my $package_label = defined $provisioning_perl_package ? $provisioning_perl_package : '';
            notify($ERRORS{'WARNING'}, $LOG, "$package_label module could not be loaded");
            notify($ERRORS{'OK'}, $LOG, "returning 0");
            # NOTE(review): this ends the run for all remaining computers as
            # well; kept because callers may depend on the 0 return value.
            return 0;
        }

        # Call <package>::node_status as a plain function. The block eval
        # traps a missing or dying routine without building code from a string.
        my $node_status = eval {
            no strict 'refs';
            &{"${provisioning_perl_package}::node_status"}($computer_entry);
        };
        if ($EVAL_ERROR) {
            notify($ERRORS{'WARNING'}, $LOG, "${provisioning_perl_package}::node_status failed for $comp_hostname: $EVAL_ERROR");
        }
        else {
            notify($ERRORS{'OK'}, $LOG, "loaded $provisioning_perl_package");
        }

        # Start from an empty status so the matches below are always defined
        my $node_status_string = '';
        if (UNIVERSAL::isa($node_status, 'HASH') && defined $node_status->{status}) {
            $node_status_string = $node_status->{status};
            notify($ERRORS{'DEBUG'}, $LOG, "node_status hash reference contains key {status}=$node_status_string");
        }
        else {
            notify($ERRORS{'DEBUG'}, $LOG, "node_status hash reference does not contain a key called 'status'");
        }

        if ($node_status_string =~ /^ready/i) {
            #proceed
            notify($ERRORS{'OK'}, $LOG, "nodestatus reports $node_status_string for $comp_hostname");

            #update lastcheck datetime
            my $datestring = makedatestring();
            if (update_computer_lastcheck($comp_id, $datestring, $LOG)) {
                notify($ERRORS{'OK'}, $LOG, "updated lastcheckin for $comp_hostname");
            }

            #update state to available if old state is failed
            if ($comp_state =~ /failed/i) {
                if (update_computer_state($comp_id, "available", $LOG)) {
                    notify($ERRORS{'OK'}, $LOG, "updated state to available for $comp_hostname");
                }
            }
        }
        elsif ($node_status_string =~ /^reload/i) {
            # The investigators read the node status from the computer entry
            $computer_entry->{node_status} = $node_status;

            notify($ERRORS{'OK'}, $LOG, "nodestatus reports $node_status_string for $comp_hostname");

            #additional steps
            my $node_available = 0;

            if ($comp_type eq "lab") {
                # No investigator runs for lab computers.
                # NOTE(review): $node_available stays 0, so a lab computer
                # reporting reload is listed as offline below - confirm intended.
            }
            elsif ($comp_type eq "virtualmachine") {
                $node_available = 1 if _virtualmachine_investigator($computer_entry);
            }
            elsif ($comp_type eq "blade") {
                $node_available = 1 if _blade_investigator($computer_entry);
            }

            if ($node_available) {
                #update state to available
                if (update_computer_state($comp_id, "available", $LOG)) {
                    notify($ERRORS{'OK'}, $LOG, "updated state to available for $comp_hostname");
                }
                #update lastcheck datetime
                my $datestring = makedatestring();
                if (update_computer_lastcheck($comp_id, $datestring, $LOG)) {
                    notify($ERRORS{'OK'}, $LOG, "updated lastcheckin for $comp_hostname");
                }
            }
            else {
                $info->{globalmsg}->{failedbody} .= "$comp_hostname type= $comp_type offline\n";
            }
        }
        else {
            notify($ERRORS{'OK'}, $LOG, "node_status reports unknown value for $comp_hostname node_status_string= $node_status_string ");
        }
    } #for loop

    return 1;
} ## end sub process
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
=head2 _blade_investigator
| |
| Parameters : hash |
| Returns : 1,0 |
| Description : provides additional checks for blade types |
| |
| =cut |
# Additional check for a blade whose node status reported "reload".
#
# Parameters:
#   $self - computer entry (hash reference) holding the {computer},
#           {imagerevision} and {node_status} data
# Returns:
#   1 when the node status reports both ping and ssh as true and
#     _image_revision_check() succeeds,
#   0 otherwise.
sub _blade_investigator {
    my ($self) = @_;

    my $comp_hostname           = $self->{computer}->{hostname};
    my $comp_imagename          = $self->{imagerevision}->{imagename};
    my $comp_id                 = $self->{computer}->{id};
    my $nodestatus_status       = $self->{node_status}->{status};
    my $nodestatus_currentimage = $self->{node_status}->{currentimage};
    my $nodestatus_ping         = $self->{node_status}->{ping};
    my $nodestatus_ssh          = $self->{node_status}->{ssh};

    # The status is only logged; fall back to an empty string if it is missing
    $nodestatus_status = '' if !defined $nodestatus_status;

    notify($ERRORS{'OK'}, $LOG, "comp_hostname= $comp_hostname node_status_status= $nodestatus_status");

    # Without both ping and ssh the blade is treated as down
    if (!($nodestatus_ping && $nodestatus_ssh)) {
        notify($ERRORS{'OK'}, $LOG, "comp_hostname= $comp_hostname is confirmed down");
        return 0;
    }

    #If can ping and can ssh into it, compare loaded image with database imagename
    if (_image_revision_check($comp_id, $comp_imagename, $nodestatus_currentimage)) {
        notify($ERRORS{'OK'}, $LOG, "comp_hostname= $comp_hostname imagename updated");
        return 1;
    }

    return 0;
} ## end sub _blade_investigator
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
| =head2 powerdown_event |
| |
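 Parameters  : computer entry hash reference
 Returns     : 1 if the provisioning module's power_off routine returned a
               true value, 0 otherwise
 Description : Powers off a blade computer that is in the available, failed
               or maintenance state by calling the power_off routine of its
               provisioning module with the computer's short name. All
               other computers are skipped.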
| |
| =cut |
| |
# Powers off one computer during a powerdown stage.
#
# Parameters:
#   $self - computer entry (hash reference) holding the {computer} data,
#           including the provisioning module's perl package name
# Returns:
#   1 when the provisioning module's power_off routine returned a true value,
#   0 when the computer is skipped, the module cannot be loaded, or power_off
#     fails or returns a false value.
sub powerdown_event {
    my ($self) = @_;

    my $provisioning_perl_package = $self->{computer}->{provisioning}->{module}->{perlpackage};
    my $comp_type                 = $self->{computer}->{type};
    my $comp_state                = $self->{computer}->{state}->{name};
    my $computer_node_name        = $self->{computer}->{hostname};

    # Missing values are treated as empty strings so the matches below are
    # always performed on defined data.
    $comp_type          = '' if !defined $comp_type;
    $comp_state         = '' if !defined $comp_state;
    $computer_node_name = '' if !defined $computer_node_name;

    # Short name = leading run of name characters, i.e. the host name up to
    # the first dot
    my $computer_short_name = 0;
    $computer_short_name = $1 if ($computer_node_name =~ /([-_a-zA-Z0-9]*)(\.?)/);

    # Only blades in the available, failed or maintenance state are powered
    # off; every other computer is skipped.
    if (!(($comp_type =~ /blade/) && ($comp_state =~ /^(available|failed|maintenance)/))) {
        notify($ERRORS{'OK'}, $LOG, "SKIPPING $computer_short_name comp_type= $comp_type in comp_state= $comp_state");
        return 0;
    }

    my $package_label = defined $provisioning_perl_package ? $provisioning_perl_package : '';
    notify($ERRORS{'OK'}, $LOG, "calling provision module $package_label power_off routine $computer_short_name");

    # The package name is interpolated into a string eval, so only a plain
    # Perl package name is ever passed to it.
    my $module_loaded = 0;
    if ($package_label =~ /\A\w+(?:::\w+)*\z/) {
        eval "use $package_label";
        $module_loaded = 1 if !$EVAL_ERROR;
    }
    if (!$module_loaded) {
        notify($ERRORS{'WARNING'}, $LOG, "$package_label module could not be loaded");
        notify($ERRORS{'OK'}, $LOG, "returning 0");
        return 0;
    }

    # Call <package>::power_off as a plain function; the block eval traps a
    # missing or dying routine.
    my $power_off_status = eval {
        no strict 'refs';
        &{"${package_label}::power_off"}($computer_short_name);
    };
    if ($EVAL_ERROR) {
        notify($ERRORS{'WARNING'}, $LOG, "${package_label}::power_off failed for $computer_short_name: $EVAL_ERROR");
        return 0;
    }

    my $status_text = defined $power_off_status ? $power_off_status : '';
    notify($ERRORS{'OK'}, $LOG, "$status_text ");

    if ($power_off_status) {
        notify($ERRORS{'OK'}, $LOG, "SUCCESS powered_off $computer_short_name");
        return 1;
    }

    return 0;
}
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
=head2 _virtualmachine_investigator
| |
| Parameters : hash |
| Returns : 1,0 |
| Description : provides additional checks for virtualmachine types |
| |
| =cut |
| |
# Additional check for a virtual machine whose node status reported "reload".
#
# Parameters:
#   $self - computer entry (hash reference) holding the {computer},
#           {imagerevision} and {node_status} data
# Returns:
#   1 when the reported vmstate contains "off", or when a current image and
#     ssh are reported and _image_revision_check() succeeds,
#   0 otherwise.
sub _virtualmachine_investigator {
    my ($self) = @_;

    my $comp_hostname           = $self->{computer}->{hostname};
    my $comp_imagename          = $self->{imagerevision}->{imagename};
    my $comp_id                 = $self->{computer}->{id};
    my $nodestatus_currentimage = $self->{node_status}->{currentimage};
    my $nodestatus_ssh          = $self->{node_status}->{ssh};
    my $nodestatus_vmstate      = $self->{node_status}->{vmstate};

    # A node status without these keys is handled like an empty value
    $nodestatus_vmstate = '' if !defined $nodestatus_vmstate;
    $nodestatus_ssh     = '' if !defined $nodestatus_ssh;

    # Ok for node to be off
    return 1 if $nodestatus_vmstate =~ /off/;

    if (!($nodestatus_currentimage && $nodestatus_ssh)) {
        notify($ERRORS{'OK'}, $LOG, "comp_hostname= $comp_hostname is confirmed down nodestatus_vmstate= $nodestatus_vmstate nodestatus_ssh= $nodestatus_ssh");
        return 0;
    }

    # Node answers over ssh and reports an image: compare it with the database
    if (_image_revision_check($comp_id, $comp_imagename, $nodestatus_currentimage)) {
        notify($ERRORS{'OK'}, $LOG, "comp_hostname= $comp_hostname imagename updated");
        return 1;
    }

    return 0;
} ## end sub _virtualmachine_investigator
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
| =head2 _image_revision_check |
| |
 Parameters  : computer id, image name recorded for the computer,
               image name currently reported by the node
 Returns     : 1,0
 Description : Compares the recorded image name with the reported one.
               Returns 1 if they agree or if the computer entry was
               updated to the reported image name, 0 otherwise.
| |
| =cut |
| |
# Makes the image name recorded for a computer agree with the image the node
# reports.
#
# Parameters:
#   $comp_id                 - id of the computer entry to update
#   $comp_imagename          - image name currently recorded for the computer
#   $nodestatus_currentimage - image name reported by the node status
# Returns:
#   1 when both names are identical, or when they differ and
#     update_computer_imagename() succeeded,
#   0 when no current image was reported or the update failed.
sub _image_revision_check {
    my ($comp_id, $comp_imagename, $nodestatus_currentimage) = @_;

    # Nothing to compare against: never write an empty image name into the
    # computer entry.
    if (!defined $nodestatus_currentimage || $nodestatus_currentimage eq '') {
        notify($ERRORS{'OK'}, $LOG, "no current image reported, image revision not verified");
        return 0;
    }

    $comp_imagename = '' if !defined $comp_imagename;

    # Exact string comparison. A pattern match would treat characters such
    # as "(", "+" or "." in the image name as regex syntax and would accept
    # "...-v1" as matching "...-v10".
    return 1 if $comp_imagename eq $nodestatus_currentimage;

    # Names differ: update computer entry
    if (update_computer_imagename($comp_id, $nodestatus_currentimage, $LOG)) {
        return 1;
    }

    #failed to update computer image info
    notify($ERRORS{'OK'}, $LOG, "update_computer_imagename return 0");
    return 0;
} ## end sub _image_revision_check
| |
| #//////////////////////////////////////////////////////////////////////////////// |
| |
| =head2 send_report |
| |
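 Parameters  : healthcheck object
 Returns     : the value returned by mail()
 Description : Appends the node counters and the lists of corrected and
               problem nodes collected in the object to the report body and
               mails the report to the system administrator address.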
| |
| =cut |
| |
# Builds the summary report from the data collected in the healthcheck
# object and mails it to the system administrator.
#
# Parameters:
#   $hck - healthcheck object (hash reference)
# Returns:
#   whatever mail() returns.
# Side effects:
#   Appends the report lines to $hck->{globalmsg}{body}.
sub send_report {
    my ($hck) = @_;

    # NOTE(review): relies on $ENV{management_node_info} holding a hash
    # reference - confirm where it is populated.
    my $sysadmin_email = $ENV{management_node_info}{SYSADMIN_EMAIL};

    # Name shown in the report: an explicitly supplied MN value wins,
    # otherwise the hostname of the management node retrieved for this object.
    my $mn_name = $hck->{MN};
    if (!defined $mn_name && ref($hck->{managementnode}) eq 'HASH') {
        $mn_name = $hck->{managementnode}->{hostname};
    }
    $mn_name = '' if !defined $mn_name;

    # Counter key => label, in report order; unset counters are left out
    my @counter_lines = (
        ['computercount',     "Number of nodes found for this management node $mn_name"],
        ['computerschecked',  "Number of nodes checked"],
        ['computersskipped',  "Number of nodes skipped due to recent check"],
        ['labnodesfailed',    "UNavailable labnodes"],
        ['labnodesavailable', "Available labnodes"],
    );
    foreach my $counter_line (@counter_lines) {
        my ($key, $label) = @{$counter_line};
        next if !defined $hck->{$key};
        $hck->{globalmsg}->{body} .= "$label: $hck->{$key}\n";
    }

    my $corrected_body = $hck->{globalmsg}->{correctedbody};
    my $failed_body    = $hck->{globalmsg}->{failedbody};

    if (defined $corrected_body) {
        $hck->{globalmsg}->{body} .= "\nCorrected VCL nodes:\n\n$corrected_body\n";
    }
    if (defined $failed_body) {
        $hck->{globalmsg}->{body} .= "\nProblem VCL nodes:\n\n$failed_body\n";
    }
    if (!defined $failed_body && !defined $corrected_body) {
        $hck->{globalmsg}->{body} .= "\nAll nodes report ok";
    }

    return mail($sysadmin_email, "VCL node monitoring report", "$hck->{globalmsg}->{body}");
} ## end sub send_report
| |
| #///////////////////////////////////////////////////////////////////////////// |
| |
| 1; |
| __END__ |
| |
| =head1 SEE ALSO |
| |
| L<http://cwiki.apache.org/VCL/> |
| |
| =cut |