#!/usr/local/groundwork/perl/bin/perl -w --
# nagios2collage_socket.pl
# Copyright (c) 2004-2011 GroundWork Open Source, Inc.
# www.groundworkopensource.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
use strict;
use Time::Local;
use vars qw($socket $smart_update);
use IO::Socket;
use Time::HiRes;
use DBI;
use CollageQuery;
use Data::Dumper;
use GDMA::GDMAUtils;
use TypedConfig;
####################################################################
# Configuration Parameters
####################################################################
my $default_config_file = '/usr/local/groundwork/config/status-feeder.properties';
# 0 => minimal, 1 => summary, 2 => basic, 3 => XML messages, 4 => debug level, 5 => ridiculous level.
my $debug_level = undef;
my $logfile = undef; # Where the log file is to be written.
my $log_as_utf8 = 0; # Set to 0 to log Foundation messages as ISO-8859-1, to 1 to log as UTF-8.
my $thisnagios = undef; # Identifier for this instance of Nagios; should generally be `hostname -s`.
my $nagios_version = undef; # Major version only (e.g., 3).
my $statusfile = undef; # Absolute pathname of the Nagios status file.
my $cycle_sleep_time = undef; # Wait time in seconds between checks of the Nagios status.log file.
# Time between full updates to the local Foundation, in seconds. This is the longest you want to wait for updates
# to the LastCheckTime in Foundation. Set this to a longer time on busy systems. Suggested 90 second minimum,
# 300 second maximum. The longer the time, the larger the bundles of updates. Setting this too long could result
# in a "bumpy" performance curve, as the system processes large bundles. Old advice: If you set this near the
# maximum, you might also want to also increase the max_xml_bundle_size below.
my $local_full_update_time = undef;
my $smart_update = undef; # If set to 1, then send only state changes and heartbeats.
my $send_on_host_data_change = undef;
my $send_on_host_timing_change = undef;
my $send_on_service_data_change = undef;
my $send_on_service_timing_change = undef;
my $send_sync_warning = undef; # Send a console message when Nagios and Foundation are out of sync. 0 = no warning, 1 = warning.
my $send_events_for_pending_to_ok = undef; # Whether to send pending-to-ok transition events, or just skip them.
my $failure_sleep_time = undef; # Seconds to sleep before restarting after failure, to prevent tight looping.
my $foundation_host = undef; # Where to send results to Foundation.
my $foundation_port = undef; # Where to send results to Foundation.
my $xml_bundle_size = undef; # Typical number of messages to send in each bundle. This is NOT the minimum size ...
my $max_xml_bundle_size = undef; # ... but this is the maximum size. 150 seems to work reasonably well in testing.
my $sync_timeout_seconds = undef; # Soft limit on time for which accumulating messages are held before sending.
# This is the actual SO_SNDBUF value, as set by setsockopt(). This is therefore the actual size of
# the data buffer available for writing, irrespective of additional kernel bookkeeping overhead.
# This will have no effect without the companion as-yet-undocumented patch to IO::Socket::INET.
# Set this to 0 to use the system default socket send buffer size. A typical value to set here is
# 262144. (Note that the value specified here is likely to be limited to something like 131071 by
# the sysctl net.core.wmem_max parameter.)
my $send_buffer_size = undef;
# Socket timeout (in seconds), to address GWMON-7407. Typical value is 60. Set to 0 to disable.
#
# This timeout is here only for use in emergencies, when Foundation has completely frozen up and is no
# longer reading (will never read) a socket we have open. We don't want to set this value so low that
# it will interfere with normal communication, even given the fact that Foundation may wait a rather
# long time between sips from this straw as it processes a large bundle of messages that we sent it, or
# is otherwise busy and just cannot get back around to reading the socket in a reasonably short time.
my $socket_send_timeout = undef;
# Maximum number of events to accumulate before sending them all as a bundle.
my $max_event_bundle_size = undef;
# $syncwait is a multiplier of $cycle_sleep_time to wait on updates while Foundation processes a
# sync. Typical value is 20. In theory, you might need to increase this if you see deadlocks after
# commit in the framework.log file. In practice, though, the need for this should have completely
# disappeared now that we have proper synchronization with pre-flight and commit operations in place.
my $syncwait = undef;
# Options for sending state data to parent/standby server(s)
my $send_state_changes_by_nsca = undef; # Whether to send state changes and heartbeats via direct NSCA (requires primary_parent).
# Valid hostname or IP address, if $send_state_changes_by_nsca is true.
my $primary_parent = undef;
my $send_to_secondary_NSCA = undef; # 0 => do not send to secondary, 1 => send, in which case you must define secondary_parent.
# Valid hostname or IP address, if $send_state_changes_by_nsca and $send_to_secondary_NSCA are true.
my $secondary_parent = undef;
# Seconds between NSCA heartbeats (approximate; will be at least this, possibly this + $remote_full_update_time).
my $nsca_heartbeat_interval = undef;
my $nsca_full_dump_interval = undef; # Seconds between NSCA full dumps (approximate). Set to zero to disable, if desired.
my $nsca_port = undef; # Port the parent (and secondary parent) is listening on (normally 5667).
my $max_messages_per_send_nsca = undef; # Limit to the size of batched NSCA sends, to avoid overloads (typical value 100).
my $nsca_batch_delay = undef; # Sleep this many seconds between sending batches of $max_messages_per_send_nsca results
my $nsca_timeout = undef; # Give up on sending a heartbeat if we get no answer from parent after this long.
# Options for sending state data via the GDMA spooler:
my $send_state_changes_by_gdma = undef; # Whether to send state changes and heartbeats via the GDMA spooler.
my $gdma_heartbeat_interval = undef; # Seconds between GDMA heartbeats (approximate).
my $gdma_full_dump_interval = undef; # Seconds between GDMA full dumps (approximate). Set to zero to disable, if desired.
# Absolute path to the base of the GDMA software installation (typically, "/usr/local/groundwork/gdma").
# This will be used to locate the spool file the status feeder will write into.
my $gdma_install_base = undef;
my $max_unspooled_results_to_save = undef; # How many unspooled GDMA results to save for another attempt to spool them.
####################################################################
# Working Variables
####################################################################
# Derived flags, for easy testing.
my $debug_summary = undef;
my $debug_basic = undef;
my $debug_xml = undef;
my $debug_debug = undef;
my $debug_ridiculous = undef;
my @non_default_host_data_change = ();
my @non_default_host_timing_change = ();
my @non_default_service_data_change = ();
my @non_default_service_timing_change = ();
my %allowed_host_data_fields = (
CheckType => 1,
isFailurePredictionEnabled => 1,
isHostFlapping => 1,
isObsessOverHost => 1,
isProcessPerformanceData => 1
);
my %allowed_host_timing_fields = (
TimeDown => 1,
TimeUnreachable => 1,
TimeUp => 1
);
my %allowed_service_data_fields = (
CheckType => 1,
LastHardState => 1,
isFailurePredictionEnabled => 1,
isObsessOverService => 1,
isProcessPerformanceData => 1,
isServiceFlapping => 1
);
my %allowed_service_timing_fields = (
TimeCritical => 1,
TimeOK => 1,
TimeUnknown => 1,
TimeWarning => 1
);
my $heartbeat_mode = 0; # Do not change this setting -- it is controlled by smart_update.
my $last_nsca_heartbeat_time = undef;
my $last_nsca_full_dump_time = undef;
my $last_gdma_heartbeat_time = undef;
my $last_gdma_full_dump_time = undef;
my $heartbeat_high_water_mark = 100; # initial size for arrays holding heartbeat states; will be adjusted upward
my $state_change_high_water_mark = 100; # initial size for arrays holding object state changes; will be adjusted upward
my $next_sync_timeout = 0; # used for XML batching
my $message_counter = 1;
my $last_statusfile_mtime = 0;
my $element_ref = {};
my $global_nagios = {};
my $collage_status_ref = {};
my $device_ref = {};
my $host_ref = {};
my $service_ref = {};
my $loop_count = 0;
my $total_wait = 0;
my @xml_messages = ();
my @event_messages = ();
my $n_hostcount = 0;
my $n_servicecount = 0;
my $last_n_hostcount = 0;
my $last_n_servicecount = 0;
my $f_hostcount = 0;
my $f_servicecount = 0;
my $last_f_hostcount = 0;
my $last_f_servicecount = 0;
my $enable_feeding = 1;
my $syncwaitcount = 0;
my $logtime = '';
my $sync_at_start = 0;
my $looping_start_time = 0;
my $gdma_spool_filename = undef;
my $gdma_results_to_spool = [];
# These mappings must reflect the corresponding Nagios internal enumerations,
# so we can correctly interpret data from the status file.
# from nagios.h: HOST_UP, HOST_DOWN, HOST_UNREACHABLE
my %HostStatus = ( 0 => 'UP', 1 => 'DOWN', 2 => 'UNREACHABLE' );
# from nagios.h: STATE_OK, STATE_WARNING, STATE_CRITICAL, STATE_UNKNOWN
my %ServiceStatus = ( 0 => 'OK', 1 => 'WARNING', 2 => 'CRITICAL', 3 => 'UNKNOWN' );
# from common.h: HOST_CHECK_ACTIVE and SERVICE_CHECK_ACTIVE, HOST_CHECK_PASSIVE and SERVICE_CHECK_PASSIVE
my %CheckType = ( 0 => 'ACTIVE', 1 => 'PASSIVE' );
# from common.h: SOFT_STATE, HARD_STATE
my %StateType = ( 0 => 'SOFT', 1 => 'HARD' );
my %hostipaddress = ();
my $start_message =
"";
my $command_close = '';
my $restart_xml = '';
my $no_xml = '';
our $shutdown_requested = 0;
use constant ERROR_STATUS => 0;
use constant STOP_STATUS => 1;
use constant RESTART_STATUS => 2;
use constant CONTINUE_STATUS => 3;
####################################################################
# Program
####################################################################
# Here is the entire substance of this script, in a one-liner:
exit ((main() == ERROR_STATUS) ? 1 : 0);
####################################################################
# Supporting Subroutines
####################################################################
sub read_config_file {
my $config_file = shift;
eval {
my $config = TypedConfig->new ($config_file);
$debug_level = $config->get_number('debug_level');
$logfile = $config->get_scalar('logfile');
$thisnagios = $config->get_scalar('thisnagios');
$nagios_version = $config->get_number('nagios_version');
$statusfile = $config->get_scalar('statusfile');
$cycle_sleep_time = $config->get_number('cycle_sleep_time');
$local_full_update_time = $config->get_number('local_full_update_time');
$smart_update = $config->get_boolean('smart_update');
$send_on_host_data_change = $config->get_scalar('send_on_host_data_change');
$send_on_host_timing_change = $config->get_scalar('send_on_host_timing_change');
$send_on_service_data_change = $config->get_scalar('send_on_service_data_change');
$send_on_service_timing_change = $config->get_scalar('send_on_service_timing_change');
$send_sync_warning = $config->get_boolean('send_sync_warning');
$send_events_for_pending_to_ok = $config->get_boolean('send_events_for_pending_to_ok');
$failure_sleep_time = $config->get_number('failure_sleep_time');
$foundation_host = $config->get_scalar('foundation_host');
$foundation_port = $config->get_number('foundation_port');
$xml_bundle_size = $config->get_number('xml_bundle_size');
$max_xml_bundle_size = $config->get_number('max_xml_bundle_size');
$sync_timeout_seconds = $config->get_number('sync_timeout_seconds');
$send_buffer_size = $config->get_number('send_buffer_size');
$socket_send_timeout = $config->get_number('socket_send_timeout');
$max_event_bundle_size = $config->get_number('max_event_bundle_size');
$syncwait = $config->get_number('syncwait');
$send_state_changes_by_nsca = $config->get_boolean('send_state_changes_by_nsca');
$primary_parent = $config->get_scalar('primary_parent');
$send_to_secondary_NSCA = $config->get_boolean('send_to_secondary_NSCA');
$secondary_parent = $config->get_scalar('secondary_parent');
$nsca_heartbeat_interval = $config->get_number('nsca_heartbeat_interval');
$nsca_full_dump_interval = $config->get_number('nsca_full_dump_interval');
$nsca_port = $config->get_number('nsca_port');
$max_messages_per_send_nsca = $config->get_number('max_messages_per_send_nsca');
$nsca_batch_delay = $config->get_number('nsca_batch_delay');
$nsca_timeout = $config->get_number('nsca_timeout');
$send_state_changes_by_gdma = $config->get_boolean('send_state_changes_by_gdma');
$gdma_heartbeat_interval = $config->get_number('gdma_heartbeat_interval');
$gdma_full_dump_interval = $config->get_number('gdma_full_dump_interval');
$gdma_install_base = $config->get_scalar('gdma_install_base');
$max_unspooled_results_to_save = $config->get_number('max_unspooled_results_to_save');
# FIX LATER: range-validate many of the values we obtained from the config file
if ($send_state_changes_by_nsca) {
if ($primary_parent eq '') {
die "primary_parent must be non-empty if send_state_changes_by_nsca is true\n";
}
if ($send_to_secondary_NSCA && $secondary_parent eq '') {
die "secondary_parent must be non-empty if send_state_changes_by_nsca and send_to_secondary_NSCA are true\n";
}
if ($max_messages_per_send_nsca < 1) {
die "max_messages_per_send_nsca must be positive if send_state_changes_by_nsca is true\n";
}
}
if ($send_state_changes_by_gdma) {
if ($gdma_install_base eq '') {
die "gdma_install_base must be non-empty if send_state_changes_by_gdma is true\n";
}
if (!-d $gdma_install_base) {
die "gdma_install_base must be an existing directory if send_state_changes_by_gdma is true\n";
}
# Set up the spoolfile path based on the platform we are running on.
$gdma_spool_filename = GDMAUtils::get_spool_filename($gdma_install_base);
if ($max_unspooled_results_to_save < 0) {
die "max_unspooled_results_to_save cannot be negative\n";
}
}
if ($send_on_host_data_change ne '') {
@non_default_host_data_change = split(' ', $send_on_host_data_change);
foreach my $field (@non_default_host_data_change) {
if (not $allowed_host_data_fields{$field}){
die "send_on_host_data_change contains unknown field \"$field\"\n";
}
}
}
if ($send_on_host_timing_change ne '') {
@non_default_host_timing_change = split(' ', $send_on_host_timing_change);
foreach my $field (@non_default_host_timing_change) {
if (not $allowed_host_timing_fields{$field}){
die "send_on_host_timing_change contains unknown field \"$field\"\n";
}
}
}
if ($send_on_service_data_change ne '') {
@non_default_service_data_change = split(' ', $send_on_service_data_change);
foreach my $field (@non_default_service_data_change) {
if (not $allowed_service_data_fields{$field}){
die "send_on_service_data_change contains unknown field \"$field\"\n";
}
}
}
if ($send_on_service_timing_change ne '') {
@non_default_service_timing_change = split(' ', $send_on_service_timing_change);
foreach my $field (@non_default_service_timing_change) {
if (not $allowed_service_timing_fields{$field}){
die "send_on_service_timing_change contains unknown field \"$field\"\n";
}
}
}
$debug_summary = $debug_level >= 1;
$debug_basic = $debug_level >= 2;
$debug_xml = $debug_level >= 3;
$debug_debug = $debug_level >= 4;
$debug_ridiculous = $debug_level >= 5;
};
if ($@) {
chomp $@;
$@ =~ s/^ERROR:\s+//i;
die "Error: Cannot read config file $config_file ($@)\n";
}
}
sub freeze_logtime {
$logtime = '[' . ( scalar localtime ) . '] ';
}
sub time_text {
my $timestamp = shift;
if ( $timestamp <= 0 ) {
return '0';
}
else {
my ( $seconds, $minutes, $hours, $day_of_month, $month, $year, $wday, $yday, $isdst ) = localtime($timestamp);
return sprintf '%02d-%02d-%02d %02d:%02d:%02d', $year + 1900, $month + 1, $day_of_month, $hours, $minutes, $seconds;
}
}
sub log_message {
print LOG @_, "\n";
}
sub log_timed_message {
freeze_logtime();
print LOG $logtime, @_, "\n";
}
sub log_shutdown {
log_timed_message "=== Shutdown requested; terminating (process $$). ===";
}
sub log_socket_problem {
my $type = $_[0];
log_timed_message "Trouble $type socket: $!";
}
sub main {
# If a "once" argument was passed on the command line, just run once to synchronize state between Nagios and Foundation.
$sync_at_start = $ARGV[0] || 0;
read_config_file ($default_config_file);
if ( !open( LOG, '>>', $logfile ) ) {
print "Cannot open the logfile $logfile ($!); aborting!\n";
## FIX MINOR: follow the perf-data script model to record an error and send a summary log message to Foundation
return ERROR_STATUS;
}
LOG->autoflush(1);
log_timed_message "=== Starting up (process $$). ===";
# Set up to handle broken pipe errors. This has to be done in conjunction with later code that
# will cleanly process an EPIPE return code from a socket write.
#
# Our trivial signal handler turns SIGPIPE signals generated when we write to sockets already
# closed by the server into EPIPE errors returned from the write operations. The same would
# happen if instead we just ignored these signals, but with this mechanism we also automatically
# impose a short delay (inside the signal handler) when this situation occurs -- there is little
# reason to keep pounding the server when it has already indicated it cannot accept data just now.
$SIG{'PIPE'} = \&sig_pipe_handler;
chomp $thisnagios;
my $daemon_status = synchronized_daemon();
close LOG;
return $daemon_status;
}
sub synchronized_daemon {
my $commit_lock;
my $errors;
# We catch SIGTERM, SIGINT, and SIGQUIT so we can stop when Nagios stops, or when we are asked nicely.
local $SIG{INT} = \&handle_exit_signal;
local $SIG{QUIT} = \&handle_exit_signal;
local $SIG{TERM} = \&handle_exit_signal;
use MonarchLocks;
if ( !Locks->wait_for_file_to_disappear( $Locks::in_progress_file, \&log_timed_message, \$shutdown_requested ) ) {
log_shutdown();
return STOP_STATUS;
}
while (1) {
$errors = Locks->open_and_lock( \*commit_lock, $Locks::commit_lock_file, $Locks::SHARED, $Locks::NON_BLOCKING );
last if !@$errors;
for (@$errors) {
log_message($_);
}
sleep 30;
if ($shutdown_requested) {
log_shutdown();
return STOP_STATUS;
}
}
my ( $dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, @rest ) = stat( \*commit_lock );
my $initial_mtime = $mtime;
my $init_status = initialize_feeder();
Locks->close_and_unlock( \*commit_lock );
if ($init_status != CONTINUE_STATUS) {
log_timed_message("=== Initialization failed; will exit (process $$). ===");
return $init_status;
}
while (1) {
if ($shutdown_requested) {
flush_pending_output();
log_shutdown();
return STOP_STATUS;
}
if ( !Locks->wait_for_file_to_disappear( $Locks::in_progress_file, \&log_timed_message, \$shutdown_requested ) ) {
flush_pending_output();
log_shutdown();
return STOP_STATUS;
}
while (1) {
$errors = Locks->open_and_lock( \*commit_lock, $Locks::commit_lock_file, $Locks::SHARED, $Locks::NON_BLOCKING );
last if !@$errors;
for (@$errors) {
log_message($_);
}
sleep 30;
if ($shutdown_requested) {
flush_pending_output();
log_shutdown();
return STOP_STATUS;
}
}
( $dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, @rest ) = stat( \*commit_lock );
if ( $mtime != $initial_mtime ) {
Locks->close_and_unlock( \*commit_lock );
flush_pending_output();
log_timed_message("=== A commit has occurred; will exit to start over and re-initialize (process $$). ===");
return RESTART_STATUS;
}
my $cycle_status = perform_feeder_cycle_actions();
Locks->close_and_unlock( \*commit_lock );
if ($cycle_status != CONTINUE_STATUS) {
flush_pending_output();
log_timed_message("=== Cycle status is not to continue; will exit (process $$). ===");
return $cycle_status;
}
if ($shutdown_requested) {
flush_pending_output();
log_shutdown();
return STOP_STATUS;
}
# Sleep until the next cycle boundary.
sleep $cycle_sleep_time;
}
}
sub flush_pending_output {
if (@xml_messages) {
## Note that $message_counter may well be -1 at this point.
$message_counter = output_bundle_to_socket( \@xml_messages, $message_counter );
@xml_messages = ();
}
## Note that $message_counter may well be -1 at this point.
$message_counter = send_pending_events( $message_counter, 1 );
}
# This signal handler is for ordinary use, during code that can be expected to check the
# $shutdown_requested flag fairly often.
sub handle_exit_signal {
my $signame = shift;
$shutdown_requested = 1;
# for developer debugging only
# log_timed_message "ERROR: Received SIG$signame; aborting!";
}
# This signal handler is to be potentially installed as an alternate signal handler only around
# code that might run for a long time without checking the $shutdown_requested flag. DBI calls
# often fall into this category; the C code within the DBI library might simply resume its action
# after seeing an EINTR, and not return to Perl so we can recognize the interrupt. (DBD::mysql
# does not implement the $sth->cancel() operation, so that is not an option; see the DBI
# documentation about this.) If you do use this, whatever cleanup activities you would
# ordinarily run before final process exit won't be run, so keep that in mind in the design of
# the overall script algorithm.
#
# Unfortunately, actual testing under heavy disk load shows that even running this short signal
# handler that exits from within its own context is not good enough to kill the script quickly
# upon receipt of a termination signal. So instead we just revert to the usual system default
# behavior for such signals, allowing them to terminate the process directly.
sub die_upon_exit_signal {
my $signame = shift;
log_timed_message "NOTICE: Received SIG$signame; exiting!";
log_shutdown();
exit (1);
}
sub sig_pipe_handler {
sleep 2;
}
sub initialize_feeder {
## Pre-extend the event_messages array for later efficiency, then truncate back to an empty state.
$#event_messages = $max_event_bundle_size;
@event_messages = ();
my $failed = 1;
if ( my $socket = IO::Socket::INET->new( PeerAddr => $foundation_host, PeerPort => $foundation_port, Proto => 'tcp', Type => SOCK_STREAM ) ) {
$socket->autoflush();
log_timed_message 'Start message local port: ', $socket->sockport() if $debug_summary;
$failed = 0;
unless ( $socket->sockopt(SO_SNDTIMEO, pack('L!L!', $socket_send_timeout, 0)) ) {
log_socket_problem ('setting send timeout on');
$failed = 1;
}
unless ($failed) {
log_timed_message 'Writing start message to Foundation.' if $debug_summary;
unless ( $socket->print ($start_message) ) {
log_socket_problem ('writing to');
$failed = 1;
}
else {
LOG->print ($start_message, "\n") if $debug_xml;
}
}
unless ($failed) {
log_timed_message 'Writing close message to Foundation.' if $debug_summary;
unless ( $socket->print ($command_close) ) {
log_socket_problem ('writing to');
$failed = 1;
}
else {
LOG->print ($command_close, "\n") if $debug_xml;
}
}
unless ( close($socket) ) {
log_socket_problem ('closing');
$failed = 1;
}
}
if ($failed) {
log_timed_message "Listener services not available. Retrying in $failure_sleep_time seconds.";
sleep $failure_sleep_time;
return RESTART_STATUS;
}
my $init_start_time = Time::HiRes::time();
log_timed_message 'loading cached addresses ...';
load_cached_addresses() or return ERROR_STATUS;
log_timed_message 'loading global nagios parameters ...';
$global_nagios = get_globals( $statusfile );
if ( !defined($global_nagios) ) {
return RESTART_STATUS;
}
log_timed_message 'loading initial state ...';
my $ref = getInitialState($collage_status_ref);
if ($shutdown_requested) {
log_shutdown();
return STOP_STATUS;
}
if ( !defined($ref) ) {
return RESTART_STATUS;
}
# Startup message to parent - send sync
if ( $send_state_changes_by_nsca || $send_state_changes_by_gdma ) {
my $full_dump = assemble_remote_full_dump($collage_status_ref);
my $last_full_dump_time = Time::HiRes::time();
if ($send_state_changes_by_nsca) {
send_nsca( $primary_parent, $nsca_port, $nsca_timeout, $send_to_secondary_NSCA, $secondary_parent,
$max_messages_per_send_nsca, $nsca_batch_delay, $full_dump );
$last_nsca_full_dump_time = $last_full_dump_time;
}
if ($send_state_changes_by_gdma) {
gdma_spool($gdma_results_to_spool, $full_dump);
$last_gdma_full_dump_time = $last_full_dump_time;
}
}
if ($shutdown_requested) {
log_shutdown();
return STOP_STATUS;
}
if ($debug_summary) {
my $init_time = sprintf '%0.4F', ( Time::HiRes::time() - $init_start_time );
freeze_logtime();
print "Startup init time=$init_time seconds.\n";
print LOG "${logtime}Startup init time=$init_time seconds.\n";
}
if ( $debug_ridiculous ) {
freeze_logtime();
print LOG $logtime, Data::Dumper->Dump( [ \%{$collage_status_ref} ], [qw(\%{collage_status_ref})] );
}
$total_wait = 0;
$n_hostcount = 0;
$n_servicecount = 0;
$next_sync_timeout = time + $sync_timeout_seconds;
$looping_start_time = Time::HiRes::time();
log_timed_message 'starting main loop ...';
return CONTINUE_STATUS;
}
sub perform_feeder_cycle_actions {
my $start_time = Time::HiRes::time();
if ( $debug_summary ) {
log_timed_message 'Starting cycle.';
}
$total_wait += $cycle_sleep_time;
# Don't bother with this loop iteration if the input data hasn't changed since last time.
my $statusfile_mtime = (stat($statusfile))[9];
if ( !defined $statusfile_mtime ) {
freeze_logtime();
print "Warning: stat of file $statusfile failed: $!\n";
print LOG "${logtime}Warning: stat of file $statusfile failed: $!\n";
sleep $failure_sleep_time;
return ERROR_STATUS;
}
elsif ($statusfile_mtime <= $last_statusfile_mtime) {
print LOG "Skipping cycle -- $statusfile has not changed.\n";
}
else {
$last_statusfile_mtime = $statusfile_mtime;
if ( $total_wait >= $local_full_update_time ) {
$total_wait = 0;
if ($smart_update) {
## Time to send heartbeat. That is, time to update LastUpdateTime stamps.
$heartbeat_mode = 1;
print LOG "Heartbeat in progress this cycle ...\n" if $debug_summary;
}
}
# Check count of hosts and services in Nagios and Foundation.
# Note: Unlike in getInitialState(), the calls to CollageQuery here are purposely not
# set up to die immediately should a termination signal be received while the queries are
# running. That's for two reasons. One, the calls we use here are simple "select count(*)"
# queries that we don't expect to run terribly long. And two, in this part of the logic,
# we want to allow the caller an opportunity to clean up and flush any pending data before
# we exit the process.
my $foundation;
eval {
$foundation = CollageQuery->new();
};
if ($@) {
chomp $@;
print LOG $@, "\n";
return ERROR_STATUS;
}
$f_hostcount = $foundation->getHostCount('NAGIOS');
print LOG "Foundation Host Count: $f_hostcount\n" if $debug_basic;
$f_servicecount = $foundation->getServiceCount('NAGIOS');
print LOG "Foundation Service Count: $f_servicecount\n" if $debug_basic;
# Get the status and counts from Nagios
$element_ref = get_status( $statusfile, $nagios_version );
if ($shutdown_requested) {
log_shutdown();
return STOP_STATUS;
}
if ( !defined($element_ref) ) {
return RESTART_STATUS;
}
print LOG "Nagios Host Count: $n_hostcount\n" if $debug_basic;
print LOG "Nagios Service Count: $n_servicecount\n" if $debug_basic;
if ( $loop_count == 0 ) { # first loop will not have last counts
$last_f_hostcount = $f_hostcount;
$last_f_servicecount = $f_servicecount;
$last_n_hostcount = $n_hostcount;
$last_n_servicecount = $n_servicecount;
}
# Now we can compare counts and see if Nagios and Foundation are in sync
if ( ( $f_hostcount ne $n_hostcount ) or ( $f_servicecount ne $n_servicecount ) ) {
# Hold off on updates for a bit, because Nagios and Foundation are not synced. With the proper
# synchronization code for this script now in play, this should never happen. We keep this code
# around mainly to generate the out-of-sync message in case somehow the unexpected happens.
if ( $syncwaitcount >= $syncwait ) {
# Tell the log about the differences that caused the sync errors
if ( $debug_summary ) {
my $deltas = find_deltas( $element_ref, $collage_status_ref );
if ( $f_hostcount ne $n_hostcount ) {
print LOG "Found $f_hostcount hosts in Foundation and $n_hostcount hosts in Nagios.\n";
}
if ( $f_servicecount ne $n_servicecount ) {
print LOG "Found $f_servicecount services in Foundation and $n_servicecount services in Nagios.\n";
}
if ( $debug_basic ) {
print LOG "Hosts and/or services in Foundation and not in Nagios:\n";
print LOG Data::Dumper->Dump( [ \%{ $deltas->{FoundationHost} } ], [qw(Foundation)] );
print LOG "Hosts and/or services in Nagios and not in Foundation:\n";
print LOG Data::Dumper->Dump( [ \%{ $deltas->{NagiosHost} } ], [qw(Nagios)] );
}
}
$enable_feeding = 1;
$syncwaitcount = 0;
if ( $loop_count != 0 ) {
log_timed_message 'Out of sync for too long!! Please try commit again. Resuming feeding.';
if ($send_sync_warning) {
unless ( my $socket = IO::Socket::INET->new(
PeerAddr => $foundation_host, PeerPort => $foundation_port, Proto => 'tcp', Type => SOCK_STREAM
) ) {
log_timed_message 'Listener services not available.';
# We don't "return RESTART_STATUS;" here or on subsequent socket failures because
# the message we're about to submit is just advisory, and the opportunity to submit
# the same message will probably appear again in a later processing cycle.
}
else {
$socket->autoflush();
log_timed_message 'Out-of-sync message local port: ', $socket->sockport() if $debug_summary;
unless ( $socket->sockopt(SO_SNDTIMEO, pack('L!L!', $socket_send_timeout, 0)) ) {
log_socket_problem ('setting send timeout on');
}
else {
unless ( $socket->print (
""
) ) {
log_timed_message 'Writing log message to Foundation.' if $debug_summary;
log_socket_problem ('writing to');
}
}
unless ( close($socket) ) {
log_socket_problem ('closing');
}
# Re-query Foundation
$f_hostcount = $foundation->getHostCount('NAGIOS');
$f_servicecount = $foundation->getServiceCount('NAGIOS');
}
}
}
}
else {
$enable_feeding = 0;
$syncwaitcount++;
my $cycles_left = $syncwait - $syncwaitcount;
log_timed_message "Out of sync detected! Waiting on updates for up to $cycles_left more cycles ...";
}
}
else {
if ( ( $last_f_hostcount ne $f_hostcount )
or ( $last_f_servicecount ne $f_servicecount )
or ( $last_n_hostcount ne $n_hostcount )
or ( $last_n_servicecount ne $n_servicecount ) )
{
## Case 1: Changed, but in sync. We missed the sync, so just re-start.
log_timed_message 'Changed, but in sync (missed sync). Restarting.';
return RESTART_STATUS;
}
}
# Now reset the counts for next time
$last_f_hostcount = $f_hostcount;
$last_f_servicecount = $f_servicecount;
$last_n_hostcount = $n_hostcount;
$last_n_servicecount = $n_servicecount;
$n_hostcount = 0;
$n_servicecount = 0;
if ( $element_ref && $enable_feeding ) {
my $host_updates_ref;
my $serv_updates_ref;
if ($heartbeat_mode) {
$global_nagios = get_globals( $statusfile );
if ( !defined($global_nagios) ) {
return RESTART_STATUS;
}
}
my $state_changes = ( $send_state_changes_by_nsca || $send_state_changes_by_gdma ) ? [] : undef;
$host_updates_ref = build_host_xml( $thisnagios, $element_ref, $collage_status_ref, $state_changes );
return RESTART_STATUS if not defined $host_updates_ref;
$serv_updates_ref = build_service_xml( $thisnagios, $element_ref, $collage_status_ref, $state_changes );
return RESTART_STATUS if not defined $serv_updates_ref;
push( @xml_messages, @{$host_updates_ref} );
push( @xml_messages, @{$serv_updates_ref} );
if ( defined($state_changes) && @$state_changes ) {
if ($send_state_changes_by_nsca) {
send_nsca( $primary_parent, $nsca_port, $nsca_timeout, $send_to_secondary_NSCA, $secondary_parent,
$max_messages_per_send_nsca, $nsca_batch_delay, $state_changes );
}
if ($send_state_changes_by_gdma) {
gdma_spool($gdma_results_to_spool, $state_changes);
}
}
if ( @xml_messages >= $xml_bundle_size || ( @xml_messages > 0 && time >= $next_sync_timeout ) ) {
$message_counter = output_bundle_to_socket( \@xml_messages, $message_counter );
return RESTART_STATUS if ($message_counter < 0);
@xml_messages = ();
$next_sync_timeout = time + $sync_timeout_seconds;
$loop_count++;
if ($debug_summary) {
my $loop_time = sprintf '%0.4F', Time::HiRes::time() - $start_time;
my $avg_loop_time = sprintf '%0.4F',
( Time::HiRes::time() - $looping_start_time - ( ( $loop_count - 1 ) * $cycle_sleep_time ) ) / $loop_count;
freeze_logtime();
print "Loops Completed = $loop_count. Last loop time=$loop_time seconds. Avg loop time=$avg_loop_time seconds.\n";
print LOG "${logtime}Loops Completed = $loop_count. Last loop time=$loop_time seconds. Avg loop time=$avg_loop_time seconds.\n";
}
}
}
# quit after just one run -- legacy, now used only for development testing
if ( $enable_feeding && $sync_at_start =~ /once/ ) {
log_timed_message 'Exiting after one cycle, per command option.';
return STOP_STATUS;
}
# re-enable feeding ...
$enable_feeding = 1;
$heartbeat_mode = 0;
}
# Send any pending state transitions left in the buffer
$message_counter = send_pending_events( $message_counter, 1 );
my $now = Time::HiRes::time();
my $send_nsca_full_dump = $send_state_changes_by_nsca && ($nsca_full_dump_interval > 0) &&
(($now - $last_nsca_full_dump_time) > $nsca_full_dump_interval);
my $send_gdma_full_dump = $send_state_changes_by_gdma && ($gdma_full_dump_interval > 0) &&
(($now - $last_gdma_full_dump_time) > $gdma_full_dump_interval);
if (@$gdma_results_to_spool) {
gdma_spool($gdma_results_to_spool, []);
}
if ( $send_nsca_full_dump || $send_gdma_full_dump ) {
my $full_dump = assemble_remote_full_dump($collage_status_ref);
if ($send_nsca_full_dump) {
send_nsca( $primary_parent, $nsca_port, $nsca_timeout, $send_to_secondary_NSCA, $secondary_parent,
$max_messages_per_send_nsca, $nsca_batch_delay, $full_dump );
$last_nsca_full_dump_time = $now;
}
if ($send_gdma_full_dump) {
gdma_spool($gdma_results_to_spool, $full_dump);
$last_gdma_full_dump_time = $now;
}
}
return ($message_counter < 0) ? RESTART_STATUS : CONTINUE_STATUS;
}
sub load_cached_addresses() {
# Get hosts->IPaddress from Monarch
my ( $dbname, $dbhost, $dbuser, $dbpass, $dbtype ) = CollageQuery::readGroundworkDBConfig('monarch');
my $dsn = '';
if ( defined($dbtype) && $dbtype eq 'postgresql' ) {
$dsn = "DBI:Pg:dbname=$dbname;host=$dbhost";
}
else {
$dsn = "DBI:mysql:database=$dbname;host=$dbhost";
}
my $dbh = DBI->connect( $dsn, $dbuser, $dbpass, { 'AutoCommit' => 1 } );
if ( !$dbh ) {
log_message "Can't connect to database $dbname. Error: ", $DBI::errstr;
return 0;
}
my $query = 'select name, address from hosts;';
my $sth = $dbh->prepare($query);
if ( !$sth->execute() ) {
log_message $sth->errstr;
$sth->finish();
$dbh->disconnect();
return 0;
}
my @serviceprofile_ids = ();
while ( my $row = $sth->fetchrow_hashref() ) {
$hostipaddress{ $$row{name} } = $$row{address};
}
$sth->finish();
$dbh->disconnect();
return 1;
}
sub getInitialState {
## Check each host and service status in Foundation, and populate collage_status_ref
## with current state. Do this at startup to avoid huge initial message loads.
my $collage_status_ref = shift;
# In this routine, we set up to die instantly if certain database calls are interrupted by a signal.
# The $foundation->getHostServices() call in particular can take a considerable amount of time, but
# some of its internal database-access components (DBD::mysql) are effectively not interruptible by
# signals (the EINTR return code from some internal system call is recognized and the interrupted
# system call is restarted, instead of having some means to check a cancel-is-requested flag and
# stop the request). This script is instrumented to effectively return as quickly as signals are
# recognized by Perl, but that might be far too long for outside applications to wait for the death
# of this script once it has been signaled to terminate, especially on a very busy system (typically,
# one where the available disk i/o is saturated). Fortunately, we know by code inspection that there
# are no resources that need flushing or cleaning up before we exit here.
my $foundation;
eval {
# local $SIG{INT} = \&die_upon_exit_signal;
# local $SIG{QUIT} = \&die_upon_exit_signal;
# local $SIG{TERM} = \&die_upon_exit_signal;
local $SIG{INT} = 'DEFAULT';
local $SIG{QUIT} = 'DEFAULT';
local $SIG{TERM} = 'DEFAULT';
$foundation = CollageQuery->new();
};
if ($@) {
chomp $@;
print LOG $@, "\n";
return undef;
}
log_timed_message '... getting Nagios status ...';
my $element_ref = get_status( $statusfile, $nagios_version );
if ($shutdown_requested) {
return undef;
}
if ( !defined($element_ref) ) {
return undef;
}
if ( $debug_ridiculous ) {
freeze_logtime();
print LOG $logtime, Data::Dumper->Dump( [ \%{$element_ref} ], [qw(\%element_ref)] );
}
log_timed_message '... getting hosts ...';
my $fn_hosts = undef;
eval {
# local $SIG{INT} = \&die_upon_exit_signal;
# local $SIG{QUIT} = \&die_upon_exit_signal;
# local $SIG{TERM} = \&die_upon_exit_signal;
local $SIG{INT} = 'DEFAULT';
local $SIG{QUIT} = 'DEFAULT';
local $SIG{TERM} = 'DEFAULT';
$fn_hosts = $foundation->getHosts();
};
if ($@) {
chomp $@;
log_timed_message "Error in getHosts: $@";
return undef;
}
if ($shutdown_requested) {
return undef;
}
log_timed_message '... getting host services ...';
my $fn_host_services = undef;
eval {
# local $SIG{INT} = \&die_upon_exit_signal;
# local $SIG{QUIT} = \&die_upon_exit_signal;
# local $SIG{TERM} = \&die_upon_exit_signal;
local $SIG{INT} = 'DEFAULT';
local $SIG{QUIT} = 'DEFAULT';
local $SIG{TERM} = 'DEFAULT';
$fn_host_services = $foundation->getHostServices();
};
if ($@) {
chomp $@;
log_timed_message "Error in getHostServices: $@";
return undef;
}
if ($shutdown_requested) {
return undef;
}
log_timed_message '... processing host/service state ...';
if ( ref($fn_hosts) eq 'HASH' ) {
foreach my $host ( keys %{$fn_hosts} ) {
my $fn_host = $fn_hosts->{$host};
my $cs_host = \%{ $collage_status_ref->{Host}->{$host} };
my $el_host = $element_ref->{Host}->{$host};
if ( $debug_debug ) {
print LOG Data::Dumper->Dump( [ $fn_host ], [qw($fn_host)] );
print LOG "Nagios last check time: $el_host->{LastCheckTime}\n";
print LOG "Nagios next check time: $el_host->{NextCheckTime}\n";
}
# Look for hosts that have never been checked -- don't bother sending results if so.
if ( $el_host->{LastCheckTime} eq '0' && ( !defined $fn_host->{LastCheckTime} ) ) {
$cs_host->{LastCheckTime} = '0'; # This will show up as no change of state
}
else {
$cs_host->{LastCheckTime} = $fn_host->{LastCheckTime}; # Might be a change, might not
}
# Do the same for NexCheckTime in case it was never fed (like for passive checks)
if ( $el_host->{NextCheckTime} eq '0' && ( !defined $fn_host->{NextCheckTime} ) ) {
$cs_host->{NextCheckTime} = '0'; # This will show up as no change of state
}
else {
$cs_host->{NextCheckTime} = $fn_host->{NextCheckTime}; # Might be a change, might not
}
# Do the same for LastNotificationTime
if ( $el_host->{LastNotificationTime} eq '0' && ( !defined $fn_host->{LastNotificationTime} ) ) {
$cs_host->{LastNotificationTime} = '0'; # This will show up as no change of state
}
else {
$cs_host->{LastNotificationTime} = $fn_host->{LastNotificationTime}; # Might be a change, might not
}
$cs_host->{Comments} = $fn_host->{Comments};
$cs_host->{CurrentAttempt} = $fn_host->{CurrentAttempt};
$cs_host->{CurrentNotificationNumber} = $fn_host->{CurrentNotificationNumber};
$cs_host->{ExecutionTime} = $fn_host->{ExecutionTime};
$cs_host->{Latency} = $fn_host->{Latency};
$cs_host->{MaxAttempts} = $fn_host->{MaxAttempts};
$cs_host->{MonitorStatus} = $fn_host->{MonitorStatus};
$cs_host->{NextCheckTime} = $fn_host->{NextCheckTime};
$cs_host->{ScheduledDowntimeDepth} = $fn_host->{ScheduledDowntimeDepth};
$cs_host->{StateType} = $fn_host->{StateType};
$cs_host->{isAcknowledged} = $fn_host->{isAcknowledged};
$cs_host->{isChecksEnabled} = $fn_host->{isChecksEnabled};
$cs_host->{isEventHandlersEnabled} = $fn_host->{isEventHandlersEnabled};
$cs_host->{isFlapDetectionEnabled} = $fn_host->{isFlapDetectionEnabled};
# $cs_host->{isHostFlapping} = $fn_host->{isHostFlapping};
$cs_host->{isNotificationsEnabled} = $fn_host->{isNotificationsEnabled};
# $cs_host->{isObsessOverHost} = $fn_host->{isObsessOverHost};
$cs_host->{isPassiveChecksEnabled} = $fn_host->{isPassiveChecksEnabled};
$cs_host->{LastPluginOutput} = $fn_host->{LastPluginOutput};
$cs_host->{PercentStateChange} = $fn_host->{PercentStateChange};
$cs_host->{LastStateChange} = $fn_host->{LastStateChange};
# Look for fancy MonitorStatus values and translate to the simple ones Nagios knows
if ( $fn_host->{MonitorStatus} =~ /DOWN/ ) {
$cs_host->{MonitorStatus} = 'DOWN';
}
# FIX FUTURE: We ignore isObsessOverHost for now, as it is not needed in Foundation (yet).
# Similarly, we ignore isHostFlapping.
# isObsessOverHost (property)
# The isObsessOverHost flag is perhaps problematic. The obsess_over_host flag can be set in Nagios
# for an individual host, but such settings can be globally overridden by the obsess_over_hosts flag
# at the Nagios level. So we need to override the host setting with the global if it's off ...
# if ( $global_nagios->{obsess_over_hosts} == 0 ) {
# $cs_host->{isObsessOverHost} = 0;
# }
# Separately, this property is not set in Foundation: GWMON-7678 filed to address this.
# Take out the following assignment when that issue is resolved:
# if ( !defined $cs_host->{isObsessOverHost} ) {
# $cs_host->{isObsessOverHost} = $el_host->{isObsessOverHost};
# }
####
if ( !defined $cs_host->{Comments} ) {
$cs_host->{Comments} = ' ';
}
if ( ref($fn_host_services) eq 'HASH' ) {
foreach my $service ( keys %{ $fn_host_services->{$host} } ) {
my $fn_svc = $fn_host_services->{$host}->{$service};
my $cs_svc = \%{ $cs_host->{Service}->{$service} };
my $el_svc = $el_host->{Service}->{$service};
if ( $debug_debug ) {
print LOG Data::Dumper->Dump( [ $fn_svc ], [qw($fn_svc)] );
}
my $f_state = $fn_svc->{MonitorStatus};
my $n_state = $el_svc->{MonitorStatus};
# $fn_svc->{LastCheckTime}; This does not exist -- must use the Check Time from the current status log ...
$cs_svc->{LastCheckTime} = $el_svc->{LastCheckTime};
# $fn_svc->{LastNotificationTime}; This might not be defined, so if 0 in nagios, don't generate a difference.
if ( $el_svc->{LastNotificationTime} eq '0' && ( !defined $fn_svc->{LastNotificationTime} ) ) {
$cs_svc->{LastNotificationTime} = '0'; # This will show up as no change of state
} else {
$cs_svc->{LastNotificationTime} = $fn_svc->{LastNotificationTime}; # Might be a change, might not
}
if ( !defined $cs_svc->{Comments} ) {
$cs_svc->{Comments} = ' ';
}
$cs_svc->{MonitorStatus} = $fn_svc->{MonitorStatus};
$cs_svc->{CurrentAttempt} = $fn_svc->{CurrentAttempt};
$cs_svc->{CurrentNotificationNumber} = $fn_svc->{CurrentNotificationNumber};
$cs_svc->{MaxAttempts} = $fn_svc->{MaxAttempts};
$cs_svc->{NextCheckTime} = $fn_svc->{NextCheckTime};
$cs_svc->{ScheduledDowntimeDepth} = $fn_svc->{ScheduledDowntimeDepth};
$cs_svc->{isAcceptPassiveChecks} = $fn_svc->{isAcceptPassiveChecks};
$cs_svc->{isChecksEnabled} = $fn_svc->{isChecksEnabled};
$cs_svc->{isEventHandlersEnabled} = $fn_svc->{isEventHandlersEnabled};
$cs_svc->{isFlapDetectionEnabled} = $fn_svc->{isFlapDetectionEnabled};
$cs_svc->{isNotificationsEnabled} = $fn_svc->{isNotificationsEnabled};
# $cs_svc->{isObsessOverService} = $fn_svc->{isObsessOverService};
$cs_svc->{isProblemAcknowledged} = $fn_svc->{isProblemAcknowledged};
# $cs_svc->{isServiceFlapping} = $fn_svc->{isServiceFlapping};
$cs_svc->{LastPluginOutput} = $fn_svc->{LastPluginOutput};
$cs_svc->{PercentStateChange} = $fn_svc->{PercentStateChange};
$cs_svc->{Latency} = $fn_svc->{Latency};
$cs_svc->{ExecutionTime} = $fn_svc->{ExecutionTime};
$cs_svc->{LastStateChange} = $fn_svc->{LastStateChange};
$cs_svc->{StateType} = $fn_svc->{StateType};
# Look for fancy MonitorStatus values and translate to the simple ones Nagios knows
if ( $fn_svc->{MonitorStatus} =~ /CRITICAL/ ) {
$cs_svc->{MonitorStatus} = 'CRITICAL';
} elsif ( $fn_svc->{MonitorStatus} =~ /WARNING/ ) {
$cs_svc->{MonitorStatus} = 'WARNING';
}
}
}
}
}
return $collage_status_ref;
}
sub open_socket {
my $socket = undef;
my $failed = 1;
# FIX FUTURE: Here and for all the other sockets in this script, we want to implement a
# connect timeout, possibly by using the new() Timeout parameter. But the documentation
# is terribly ambiguous about the actual effect of that setting, so careful testing is
# required to verify that it would have the desired effect.
#
# SendBuf is an as-yet-undocumented patch to IO::Socket::INET.
my @socket_args = ( PeerAddr => $foundation_host, PeerPort => $foundation_port, Proto => 'tcp', Type => SOCK_STREAM );
push @socket_args, ( SendBuf => $send_buffer_size ) if ($send_buffer_size > 0);
unless ( $socket = IO::Socket::INET->new( @socket_args ) ) {
log_timed_message "Couldn't connect to $foundation_host:$foundation_port : $!";
}
else {
$socket->autoflush();
log_timed_message 'Output bundle local port: ', $socket->sockport() if $debug_summary;
$failed = 0;
# Here we set a send timeout. The right value is subject to discussion, given that it may depend
# on the current load of the receiver process. Compare this send timout with the receiver timeout,
# which is set as thread.timeout.idle in /usr/local/groundwork/config/foundation.properties .
unless ( $socket->sockopt(SO_SNDTIMEO, pack('L!L!', $socket_send_timeout, 0)) ) {
log_socket_problem ('setting send timeout on');
$failed = 1;
}
if ($debug_summary) {
my $send_buf = $socket->sockopt(SO_SNDBUF);
unless ( $send_buf >= 0 ) {
log_socket_problem ('getting send buffer size on');
$failed = 1;
}
log_timed_message 'Reported socket send buffer size: ', $send_buf;
}
if ($failed) {
unless ( close($socket) ) {
log_socket_problem ('closing');
}
$socket = undef;
}
}
return $socket;
}
# Close the socket, whether it was working or faulty.
sub close_socket {
my $socket = shift;
my $failed = shift;
unless ($failed) {
log_timed_message 'Writing close message to Foundation.' if $debug_summary;
unless ( $socket->print ($command_close) ) {
log_socket_problem ('writing to');
$failed = 1;
}
else {
LOG->print ($command_close, "\n\n") if $debug_xml;
}
}
# FIX FUTURE: This socket closing will invoke a write operation on any data still left hanging
# within Perl's own buffering of the data we wrote above. Generally, each of the writes above
# would have written all the data in the buffer before the write returned to this code. But some
# data can be left in the Perl buffers if the socket write times out. And now this close() will
# attempt to write that data, to a socket which is probably bungled, without the last previous
# write having successfully completed (but with the write pointer inexplicably updated in spite
# of the error) -- clearly a bad idea from the point of view of the downstream reader, who will
# now be faced with a corrupted data stream if this additional writing actually succeeds in
# transferring any data. So to minimize problems, we ought to figure out how to clear the Perl
# buffer before attempting the close() operation, if not all of the data got sent above. But we
# currently don't see any IO::Handle method that will carry out this $socket->clear() operation.
# The upshot is that any additional writes invoked here may also block and be subject to whatever
# SO_SNDTIMEO timeout we set above on the socket. (I suppose we could set that timeout here to
# just 1 microsecond, as the closest approximation to what we want, given the tools available.
# That won't actually prevent the extra write(s) from occurring, though.)
unless ( !$failed || $socket->sockopt(SO_SNDTIMEO, pack('L!L!', 0, 1)) ) {
log_socket_problem ('setting send timeout on');
}
# An error reported here might be due to an error writing whatever remains in the Perl i/o
# buffering. If that is true, then we should treat it just like a failure to write just
# above, and revert back to the beginning of this adapter packet and re-send the entire thing.
unless ( close($socket) ) {
log_socket_problem ('closing');
$failed = 1;
}
return !$failed;
}
sub output_bundle_to_socket {
my $msg_ref = shift;
my $series_num = shift;
my $socket;
my $failed = 1;
$socket = open_socket();
if ($socket) {
$failed = 0;
my $use_careful_sockets = 1;
my $use_efficient_sockets = 0;
if ($use_careful_sockets) {
## Efficient operation as below, except that we limit the total amount of data sent
## per connection, closing it and opening a new connection if we exceed that limit.
## Also, this code is able to accommodate a transient sending failure by retrying
## the failed operation.
my $next = 0;
my $last = -1;
my $last_index = $#$msg_ref;
my $element_begin = undef;
my $element_end = "\n";
my $elements;
my $bytes_per_connection = 0;
my $max_bytes_per_connection = 253952; # 256K - 8K, for initial testing
my $send_retries = 0;
my $max_send_retries = 3;
while ( $next <= $last_index ) {
$last = $next + $max_xml_bundle_size - 1;
$last = $last_index if $last > $last_index;
my $curr;
for ($curr = $next; $curr <= $last; ++$curr) {
$bytes_per_connection += length( $msg_ref->[$curr] );
last if ($bytes_per_connection > $max_bytes_per_connection);
}
--$curr;
$last = ($curr < $next) ? $next : $curr;
$series_num++;
$element_begin =
qq(\n\n);
$elements = join( '', $element_begin, @{$msg_ref}[ $next .. $last ], $element_end );
LOG->print ($elements, "\n") if $debug_xml && !$log_as_utf8;
utf8::encode($elements);
log_timed_message "Writing Adapter message (Session $series_num) to Foundation: ", length($elements), ' bytes.' if $debug_summary;
unless ( $socket->print ($elements) ) {
log_socket_problem ('writing to');
if (++$send_retries > $max_send_retries) {
log_timed_message 'Too many retries on socket writing -- will exit.';
}
else {
# Ignore errors on closing, as we already know the socket is faulty.
close_socket($socket, 1);
$socket = open_socket();
if ($socket) {
$bytes_per_connection = 0;
redo;
}
}
$failed = 1;
last;
}
if ($shutdown_requested) {
log_shutdown();
close_socket($socket, 0);
$socket = undef;
$failed = 1;
last;
}
LOG->print ($elements, "\n") if $debug_xml && $log_as_utf8;
if ($bytes_per_connection > $max_bytes_per_connection && $last < $last_index) {
# We've sent enough already on this particular connection, and there is
# still more data to send. Use a new connection for the remaining data.
my $clean_close = close_socket($socket, 0);
$socket = open_socket();
if (!$socket) {
$failed = 1;
last;
}
$send_retries = 0;
$bytes_per_connection = 0;
redo if !$clean_close;
}
$next = $last + 1;
}
}
elsif ($use_efficient_sockets) {
## Efficient operation, except that the underlying PerlIO buffering layer will
## break up our individual write actions here into actual max-4096-byte write()
## calls, thereby preventing the efficiency gains we aim for here. We have
## found no way to set the Perl buffering and write() sizes to a larger value.
my $next = 0;
my $last = -1;
my $last_index = $#$msg_ref;
my $element_begin = undef;
my $element_end = "\n";
my $elements;
while ( $next <= $last_index ) {
$last = $next + $max_xml_bundle_size - 1;
$last = $last_index if $last > $last_index;
$series_num++;
$element_begin =
qq(\n\n);
$elements = join( '', $element_begin, @{$msg_ref}[ $next .. $last ], $element_end );
LOG->print ($elements, "\n") if $debug_xml && !$log_as_utf8;
utf8::encode($elements);
log_timed_message "Writing Adapter message (Session $series_num) to Foundation: ", length($elements), ' bytes.' if $debug_summary;
unless ( $socket->print ($elements) ) {
log_socket_problem ('writing to');
$failed = 1;
last;
}
if ($shutdown_requested) {
log_shutdown();
$failed = 1;
last;
}
LOG->print ($elements, "\n") if $debug_xml && $log_as_utf8;
$next = $last + 1;
}
}
else {
## Legacy operation, now deprecated.
my $element_begin = undef;
my $element_end = "\n";
while (@{$msg_ref}) {
$series_num++;
$element_begin =
qq(\n\n);
log_timed_message 'Writing Adapter begin message to Foundation.' if $debug_summary;
unless ( $socket->print ($element_begin) ) {
log_socket_problem ('writing to');
$failed = 1;
last;
}
if ($shutdown_requested) {
log_shutdown();
$failed = 1;
last;
}
LOG->print ($element_begin, "\n") if $debug_xml;
my $num_messages_output = 0;
while ( @{$msg_ref} && $num_messages_output < $max_xml_bundle_size ) {
$num_messages_output++;
my $message = shift( @{$msg_ref} );
LOG->print ($message, "\n") if $debug_xml && !$log_as_utf8;
utf8::encode($message);
log_timed_message 'Writing Adapter body message to Foundation.' if $debug_summary;
unless ( $socket->print ($message) ) {
log_socket_problem ('writing to');
$failed = 1;
last;
}
if ($shutdown_requested) {
log_shutdown();
$failed = 1;
last;
}
LOG->print ($message, "\n") if $debug_xml && $log_as_utf8;
}
last if $failed;
log_timed_message 'Writing Adapter end message to Foundation.' if $debug_summary;
unless ( $socket->print ($element_end) ) {
log_socket_problem ('writing to');
$failed = 1;
last;
}
if ($shutdown_requested) {
log_shutdown();
$failed = 1;
last;
}
LOG->print ($element_end, "\n") if $debug_xml;
}
}
if ($socket) {
$failed |= !close_socket($socket, $failed);
}
}
return $failed ? -1 : $series_num;
}
sub send_pending_events {
my $series_num = shift;
my $max_bundle_size = shift;
my $failed = 0;
if ( scalar(@event_messages) >= $max_bundle_size ) {
my $socket;
$failed = 1;
for (my $attempts = 10; --$attempts >= 0; ) {
# SendBuf is an as-yet-undocumented patch to IO::Socket::INET.
my @socket_args = ( PeerAddr => $foundation_host, PeerPort => $foundation_port, Proto => 'tcp', Type => SOCK_STREAM );
push @socket_args, ( SendBuf => $send_buffer_size ) if ($send_buffer_size > 0);
if ( $socket = IO::Socket::INET->new( @socket_args ) ) {
$socket->autoflush();
log_timed_message 'Pending events local port: ', $socket->sockport() if $debug_summary;
$failed = 0;
last if $socket->sockopt(SO_SNDTIMEO, pack('L!L!', $socket_send_timeout, 0));
log_socket_problem ('setting send timeout on');
$failed = 1;
unless ( close($socket) ) {
log_socket_problem ('closing');
}
}
log_timed_message 'Cannot open a socket to the Foundation listener. Retrying in 2 seconds.';
sleep 2;
if ($shutdown_requested) {
log_shutdown();
last;
}
}
if ($failed) {
log_timed_message "Listener services not available. Restarting in $failure_sleep_time seconds.";
sleep $failure_sleep_time;
}
else {
## Assemble XML for sending to Foundation.
$series_num++;
my $element_begin = qq(\n);
my $element_end = "\n";
my $elements = join( "\n", $element_begin, @event_messages, $element_end, $command_close );
LOG->print ($elements, "\n") if $debug_xml && !$log_as_utf8;
utf8::encode($elements);
log_timed_message 'Writing events message to Foundation.' if $debug_summary;
unless ( $socket->print ($elements) ) {
log_socket_problem ('writing to');
$failed = 1;
}
else {
LOG->print ($elements, "\n") if $debug_xml && $log_as_utf8;
}
unless ( close($socket) ) {
log_socket_problem ('closing');
$failed = 1;
}
## Here we don't discard messages we could not send.
## That means they will build up indefinitely until we do.
if ( !$failed ) {
@event_messages = ();
}
}
}
if ($shutdown_requested) {
log_shutdown();
$failed = 1;
}
return $failed ? -1 : $series_num;
}
# FIX MINOR: drop v1/v2 support here
sub get_status {
my $statusfile = shift;
my $version = shift;
if ( $version == 3 ) {
return get_status_v3($statusfile);
}
if ( $version == 2 ) {
return get_status_v2($statusfile);
}
if ( $version == 1 ) {
return get_status_v1($statusfile);
}
print LOG "$0 error: unknown Nagios version: [$version]\n";
sleep $failure_sleep_time;
return undef;
}
# FIX MINOR: drop v1/v2 support here
sub get_status_v1 {
my $statusfile = shift;
my ( $timestamp, $msgtype );
my @field;
my $element_ref;
# FIX FUTURE: don't just abort on failure; retry 3 times or so
if ( !open( STATUSFILE, '<:unix:mmap', $statusfile ) ) {
freeze_logtime();
print "Error opening file $statusfile: $!\n";
print LOG "${logtime}Error opening file $statusfile: $!\n";
sleep $failure_sleep_time;
return undef;
}
while ( my $line = ) {
# [1100304091] HOST;Application_1;UP;1100304086;1100280796;0;7462261;6887;36466;1100280796;0;1;1;1;1;0;0.00;0;1;1;PING OK - Packet loss = 0%, RTA = 25.22 ms
if ( $line =~ /^\s*\#]/ ) { next; }
@field = split /;/, $line;
if ( $field[0] =~ /\[(\d+)\] (.*)/ ) {
$timestamp = $1;
$msgtype = $2;
}
else {
next;
}
# Use Collage database field names as service keys
my $el_host = \%{ $element_ref->{Host}->{ $field[1] } };
if ( $msgtype =~ /SERVICE/ ) {
my $el_svc = \%{ $el_host->{Service}->{ $field[2] } };
if ( $field[6] == 0 ) { $field[6] = time; }
if ( $field[12] == 0 ) { $field[12] = time; }
$field[31] =~ s/\n/ /g;
$field[31] =~ s/\f/ /g;
$field[31] =~ s/
/ /ig;
$field[31] =~ s/&/&/g;
$field[31] =~ s/"/"/g;
$field[31] =~ s/'/'/g;
$field[31] =~ s/</g;
$field[31] =~ s/>/>/g;
# $el_svc->{RetryNumber} = '1'; #$field[4];
my $tmp = $field[4];
if ( $tmp =~ /(\d+)\/(\d+)/ ) {
my $RetryNumber = $1;
my $MaxTry = $2;
$el_svc->{RetryNumber} = $RetryNumber;
}
$el_svc->{MonitorStatus} = $field[3];
$el_svc->{StateType} = $field[5];
$el_svc->{LastCheckTime} = time_text( $field[6] );
$el_svc->{NextCheckTime} = time_text( $field[7] );
$el_svc->{CheckType} = $field[8];
$el_svc->{isChecksEnabled} = $field[9];
$el_svc->{isAcceptPassiveChecks} = $field[10];
$el_svc->{isEventHandlersEnabled} = $field[11];
$el_svc->{LastStateChange} = time_text( $field[12] );
$el_svc->{isProblemAcknowledged} = $field[13];
$el_svc->{LastHardState} = $field[14];
$el_svc->{TimeOK} = $field[15];
$el_svc->{TimeUnknown} = $field[16];
$el_svc->{TimeWarning} = $field[17];
$el_svc->{TimeCritical} = $field[18];
$el_svc->{LastNotificationTime} = time_text( $field[19] );
$el_svc->{CurrentNotificationNumber} = $field[20];
$el_svc->{isNotificationsEnabled} = $field[21];
$el_svc->{Latency} = $field[22];
$el_svc->{ExecutionTime} = $field[23];
$el_svc->{isFlapDetectionEnabled} = $field[24];
$el_svc->{isServiceFlapping} = $field[25];
$el_svc->{PercentStateChange} = $field[26];
$el_svc->{ScheduledDowntimeDepth} = $field[27];
$el_svc->{isFailurePredictionEnabled} = $field[28];
$el_svc->{isProcessPerformanceData} = $field[29];
$el_svc->{isObsessOverService} = $field[30];
$el_svc->{LastPluginOutput} = $field[31];
}
elsif ( $msgtype =~ /HOST/ ) {
if ( $field[3] == 0 ) { $field[3] = time; }
if ( $field[4] == 0 ) { $field[4] = time; }
$field[20] =~ s/\n/ /g;
$field[20] =~ s/\f/ /g;
$field[20] =~ s/
/ /ig;
$field[20] =~ s/&/&/g;
$field[20] =~ s/"/"/g;
$field[20] =~ s/'/'/g;
$field[20] =~ s/</g;
$field[20] =~ s/>/>/g;
$el_host->{MonitorStatus} = $field[2];
$el_host->{LastCheckTime} = time_text( $field[3] );
$el_host->{LastStateChange} = time_text( $field[4] );
$el_host->{isAcknowledged} = $field[5];
$el_host->{TimeUp} = $field[6];
$el_host->{TimeDown} = $field[7];
$el_host->{TimeUnreachable} = $field[8];
$el_host->{LastNotificationTime} = time_text( $field[9] );
$el_host->{CurrentNotificationNumber} = $field[10];
$el_host->{isNotificationsEnabled} = $field[11];
$el_host->{isEventHandlersEnabled} = $field[12];
$el_host->{isChecksEnabled} = $field[13];
$el_host->{isFlapDetectionEnabled} = $field[14];
$el_host->{isHostIsFlapping} = $field[15];
$el_host->{PercentStateChange} = $field[16];
$el_host->{ScheduledDowntimeDepth} = $field[17];
$el_host->{isFailurePredictionEnabled} = $field[18];
$el_host->{isProcessPerformanceData} = $field[19];
$el_host->{LastPluginOutput} = $field[20];
}
elsif ( $msgtype =~ /PROGRAM/ ) {
}
}
close STATUSFILE;
return $element_ref;
}
# FIX MINOR: drop v1/v2 support here
sub get_status_v2 {
my $statusfile = shift;
my ( $timestamp, $msgtype );
my @field;
my $element_ref;
# FIX FUTURE: don't just abort on failure; retry 3 times or so
if ( !open( STATUSFILE, '<:unix:mmap', $statusfile ) ) {
freeze_logtime();
print "Error opening file $statusfile: $!\n";
print LOG "${logtime}Error opening file $statusfile: $!\n";
sleep $failure_sleep_time;
return undef;
}
my $state = '';
my %attribute = ();
while ( my $line = ) {
chomp $line;
if ( $line =~ /^\s*\#]/ ) { next; }
if ( !$state and ( $line =~ /\s*host \{/ ) ) {
$state = 'Host';
next;
}
elsif ( !$state and ( $line =~ /\s*service \{/ ) ) {
$state = 'Service';
next;
}
elsif ( ( $state eq 'Service' ) and ( $line =~ /^\s*\}/ ) and $attribute{host_name} and $attribute{service_description} ) {
my $el_svc = \%{ $element_ref->{Host}->{ $attribute{host_name} }->{Service}->{ $attribute{service_description} } };
if ( ( $attribute{last_check} == 0 ) and ( $attribute{has_been_checked} == 0 ) ) {
## $attribute{last_check} = time;
$el_svc->{MonitorStatus} = 'PENDING';
}
else {
$el_svc->{MonitorStatus} = $ServiceStatus{ $attribute{current_state} };
}
# Set element hash
# Map Nagios V2 status parameters to Nagios V1 definitions in Collage
$el_svc->{StateType} = $StateType{ $attribute{state_type} };
$el_svc->{RetryNumber} = $attribute{current_attempt};
## if ($attribute{last_check} == 0) { $attribute{last_check} = time; }
if ($attribute{plugin_output}) {
$attribute{plugin_output} =~ s/\n/ /g;
$attribute{plugin_output} =~ s/\f/ /g;
$attribute{plugin_output} =~ s/
/ /ig;
$attribute{plugin_output} =~ s/&/&/g;
$attribute{plugin_output} =~ s/"/"/g;
$attribute{plugin_output} =~ s/'/'/g;
$attribute{plugin_output} =~ s/</g;
$attribute{plugin_output} =~ s/>/>/g;
}
if ( $attribute{last_state_change} == 0 ) { $attribute{last_state_change} = time; }
## Collage expects latency in integer. Set to ms
$attribute{check_latency} = int( 1000 * $attribute{check_latency} );
## Collage expects execution time in integer. Set to ms
$attribute{check_execution_time} = int( 1000 * $attribute{check_execution_time} );
$el_svc->{CheckType} = $CheckType{ $attribute{check_type} };
$el_svc->{CurrentNotificationNumber} = $attribute{current_notification_number};
$el_svc->{ExecutionTime} = $attribute{check_execution_time};
$el_svc->{LastCheckTime} = time_text( $attribute{last_check} );
$el_svc->{LastHardState} = $ServiceStatus{ $attribute{last_hard_state} };
$el_svc->{LastNotificationTime} = time_text( $attribute{last_notification} );
$el_svc->{LastPluginOutput} = $attribute{plugin_output};
$el_svc->{LastStateChange} = time_text( $attribute{last_state_change} );
$el_svc->{Latency} = $attribute{check_latency};
$el_svc->{NextCheckTime} = time_text( $attribute{next_check} );
$el_svc->{PercentStateChange} = $attribute{percent_state_change};
$el_svc->{ScheduledDowntimeDepth} = $attribute{scheduled_downtime_depth};
$el_svc->{TimeCritical} = $attribute{last_time_critical};
$el_svc->{TimeOK} = $attribute{last_time_ok};
$el_svc->{TimeUnknown} = $attribute{last_time_unknown};
$el_svc->{TimeWarning} = $attribute{last_time_warning};
$el_svc->{isAcceptPassiveChecks} = $attribute{passive_checks_enabled};
$el_svc->{isChecksEnabled} = $attribute{active_checks_enabled};
$el_svc->{isEventHandlersEnabled} = $attribute{event_handler_enabled};
$el_svc->{isFailurePredictionEnabled} = $attribute{failure_prediction_enabled};
$el_svc->{isFlapDetectionEnabled} = $attribute{flap_detection_enabled};
$el_svc->{isNotificationsEnabled} = $attribute{notifications_enabled};
$el_svc->{isObsessOverService} = $attribute{obsess_over_service};
$el_svc->{isProblemAcknowledged} = $attribute{problem_has_been_acknowledged};
$el_svc->{isProcessPerformanceData} = $attribute{process_performance_data};
$el_svc->{isServiceFlapping} = $attribute{is_flapping};
# reset variables for next object
$state = '';
%attribute = ();
next;
}
elsif ( ( $state eq 'Host' ) and ( $line =~ /\s*\}/ ) and $attribute{host_name} ) {
my $el_host = \%{ $element_ref->{Host}->{ $attribute{host_name} } };
if ($attribute{plugin_output}) {
$attribute{plugin_output} =~ s/\n/ /g;
$attribute{plugin_output} =~ s/\f/ /g;
$attribute{plugin_output} =~ s/
/ /ig;
$attribute{plugin_output} =~ s/&/&/g;
$attribute{plugin_output} =~ s/"/"/g;
$attribute{plugin_output} =~ s/'/'/g;
$attribute{plugin_output} =~ s/</g;
$attribute{plugin_output} =~ s/>/>/g;
}
if ( ( $attribute{last_check} == 0 ) and ( $attribute{has_been_checked} == 0 ) ) {
## $attribute{last_check} = time;
$el_host->{MonitorStatus} = 'PENDING';
}
else {
$el_host->{MonitorStatus} = $HostStatus{ $attribute{current_state} };
}
if ( $attribute{last_state_change} == 0 ) { $attribute{last_state_change} = time; }
$el_host->{CheckType} = $CheckType{ $attribute{check_type} };
$el_host->{CurrentNotificationNumber} = $attribute{current_notification_number};
$el_host->{LastCheckTime} = time_text( $attribute{last_check} );
$el_host->{LastNotificationTime} = time_text( $attribute{last_notification} );
$el_host->{LastPluginOutput} = $attribute{plugin_output};
$el_host->{LastStateChange} = time_text( $attribute{last_state_change} );
$el_host->{PercentStateChange} = $attribute{percent_state_change};
$el_host->{ScheduledDowntimeDepth} = $attribute{scheduled_downtime_depth};
$el_host->{TimeDown} = $attribute{last_time_down};
$el_host->{TimeUnreachable} = $attribute{last_time_unreachable};
$el_host->{TimeUp} = $attribute{last_time_up};
$el_host->{isAcknowledged} = $attribute{problem_has_been_acknowledged};
$el_host->{isChecksEnabled} = $attribute{active_checks_enabled};
$el_host->{isEventHandlersEnabled} = $attribute{event_handler_enabled};
$el_host->{isFailurePredictionEnabled} = $attribute{failure_prediction_enabled};
$el_host->{isFlapDetectionEnabled} = $attribute{flap_detection_enabled};
$el_host->{isHostFlapping} = $attribute{is_flapping};
$el_host->{isNotificationsEnabled} = $attribute{notifications_enabled};
$el_host->{isPassiveChecksEnabled} = $attribute{passive_checks_enabled};
$el_host->{isProcessPerformanceData} = $attribute{process_performance_data};
# reset variables for next object
$state = '';
%attribute = ();
next;
}
if ( $state and ( $line =~ /\s*(\S+?)=(.*)/ ) ) {
if ( $2 ne '' ) {
$attribute{$1} = $2;
}
}
else { next; }
}
close STATUSFILE;
return $element_ref;
}
sub get_status_v3 {
my $statusfile = shift;
my ( $timestamp, $msgtype );
my @field;
my $element_ref;
# FIX FUTURE: don't just abort on failure; retry 3 times or so
if ( !open( STATUSFILE, '<:unix:mmap', $statusfile ) ) {
freeze_logtime();
print "Error opening file $statusfile: $!\n";
print LOG "${logtime}Error opening file $statusfile: $!\n";
sleep $failure_sleep_time;
return undef;
}
my $state = '';
my $hostcomment = undef;
my $servicecomment = undef;
my %attribute = ();
while ( my $line = ) {
if ($shutdown_requested) {
return undef;
}
chomp $line;
if ( $line =~ /^\s*\#]/ ) { next; }
if ( !$state and ( $line =~ /\s*host(?:status)?\s*\{/ ) ) {
$state = 'Host';
$n_hostcount++;
next;
}
elsif ( !$state and ( $line =~ /\s*service(?:status)?\s*\{/ ) ) {
$state = 'Service';
$n_servicecount++;
next;
}
elsif ( ( $state eq 'Service' ) and ( $line =~ /^\s*\}/ ) and $attribute{host_name} and $attribute{service_description} ) {
my $el_svc = \%{ $element_ref->{Host}->{ $attribute{host_name} }->{Service}->{ $attribute{service_description} } };
# Check for pending service status
if ( ( $attribute{last_check} == 0 ) and ( $attribute{has_been_checked} == 0 ) ) {
$el_svc->{MonitorStatus} = 'PENDING';
}
else {
$el_svc->{MonitorStatus} = $ServiceStatus{ $attribute{current_state} };
}
if ($attribute{plugin_output}) {
$attribute{plugin_output} =~ s/\n/ /g;
$attribute{plugin_output} =~ s/\f/ /g;
$attribute{plugin_output} =~ s/
/ /ig;
$attribute{plugin_output} =~ s/&/&/g;
$attribute{plugin_output} =~ s/"/"/g;
$attribute{plugin_output} =~ s/'/'/g;
$attribute{plugin_output} =~ s/</g;
$attribute{plugin_output} =~ s/>/>/g;
}
if ($attribute{long_plugin_output}) {
$attribute{long_plugin_output} =~ s/\n/ /g;
$attribute{long_plugin_output} =~ s/\f/ /g;
$attribute{long_plugin_output} =~ s/
/ /ig;
$attribute{long_plugin_output} =~ s/&/&/g;
$attribute{long_plugin_output} =~ s/"/"/g;
$attribute{long_plugin_output} =~ s/'/'/g;
$attribute{long_plugin_output} =~ s/</g;
$attribute{long_plugin_output} =~ s/>/>/g;
}
if ( $attribute{last_state_change} == 0 ) { $attribute{last_state_change} = time; }
## Collage expects latency in integer. Set to ms
$attribute{check_latency} = int( 1000 * $attribute{check_latency} );
## Collage expects execution time in integer. Set to ms
$attribute{check_execution_time} = int( 1000 * $attribute{check_execution_time} );
my $short_output = $attribute{plugin_output};
my $long_output = $attribute{long_plugin_output};
my $plugin_output =
(defined($short_output) && defined($long_output)) ? "$short_output $long_output" :
defined($short_output) ? $short_output : $long_output;
# Set element hash
# Map Nagios V2 status parameters to Nagios V1 definitions in Collage
$el_svc->{CheckType} = $CheckType{ $attribute{check_type} };
$el_svc->{CurrentAttempt} = $attribute{current_attempt};
$el_svc->{CurrentNotificationNumber} = $attribute{current_notification_number};
$el_svc->{ExecutionTime} = $attribute{check_execution_time};
$el_svc->{LastCheckTime} = time_text( $attribute{last_check} );
$el_svc->{LastHardState} = $ServiceStatus{ $attribute{last_hard_state} };
$el_svc->{LastNotificationTime} = time_text( $attribute{last_notification} );
$el_svc->{LastPluginOutput} = $plugin_output;
$el_svc->{LastStateChange} = time_text( $attribute{last_state_change} );
$el_svc->{Latency} = $attribute{check_latency};
$el_svc->{MaxAttempts} = $attribute{max_attempts};
$el_svc->{NextCheckTime} = time_text( $attribute{next_check} );
$el_svc->{PercentStateChange} = $attribute{percent_state_change};
## FIX MINOR: drop support for RetryNumber, as it just duplicates CurrentAttempt and is no longer used later on
## $el_svc->{RetryNumber} = $attribute{current_attempt};
$el_svc->{ScheduledDowntimeDepth} = $attribute{scheduled_downtime_depth};
$el_svc->{StateType} = $StateType{ $attribute{state_type} };
$el_svc->{TimeCritical} = $attribute{last_time_critical};
$el_svc->{TimeOK} = $attribute{last_time_ok};
$el_svc->{TimeUnknown} = $attribute{last_time_unknown};
$el_svc->{TimeWarning} = $attribute{last_time_warning};
$el_svc->{isAcceptPassiveChecks} = $attribute{passive_checks_enabled};
$el_svc->{isChecksEnabled} = $attribute{active_checks_enabled};
$el_svc->{isEventHandlersEnabled} = $attribute{event_handler_enabled};
$el_svc->{isFailurePredictionEnabled} = $attribute{failure_prediction_enabled};
$el_svc->{isFlapDetectionEnabled} = $attribute{flap_detection_enabled};
$el_svc->{isNotificationsEnabled} = $attribute{notifications_enabled};
$el_svc->{isObsessOverService} = $attribute{obsess_over_service};
$el_svc->{isProblemAcknowledged} = $attribute{problem_has_been_acknowledged};
$el_svc->{isProcessPerformanceData} = $attribute{process_performance_data};
$el_svc->{isServiceFlapping} = $attribute{is_flapping};
## Use global values to overide where needed
## Obsession
if ( $global_nagios->{obsess_over_services} == 0 ) {
$el_svc->{isObsessOverService} = 0;
}
## Notifications
if ( $global_nagios->{enable_notifications} == 0 ) {
$el_svc->{isNotificationsEnabled} = 0;
}
## Active Checks
if ( $global_nagios->{active_service_checks_enabled} == 0 ) {
$el_svc->{isChecksEnabled} = 0;
}
## Passive Checks
if ( $global_nagios->{passive_service_checks_enabled} == 0 ) {
$el_svc->{isAcceptPassiveChecks} = 0;
}
## Flap Detection
if ( $global_nagios->{enable_flap_detection} == 0 ) {
$el_svc->{isFlapDetectionEnabled} = 0;
}
## Event Handlers
if ( $global_nagios->{enable_event_handlers} == 0 ) {
$el_svc->{isEventHandlersEnabled} = 0;
}
## reset variables for next object
$state = '';
%attribute = ();
next;
}
elsif ( ( $state eq 'Host' ) and ( $line =~ /\s*\}/ ) and $attribute{host_name} ) {
my $el_host = \%{ $element_ref->{Host}->{ $attribute{host_name} } };
if ($attribute{plugin_output}) {
$attribute{plugin_output} =~ s/\n/ /g;
$attribute{plugin_output} =~ s/\f/ /g;
$attribute{plugin_output} =~ s/
/ /ig;
$attribute{plugin_output} =~ s/&/&/g;
$attribute{plugin_output} =~ s/"/"/g;
$attribute{plugin_output} =~ s/'/'/g;
$attribute{plugin_output} =~ s/</g;
$attribute{plugin_output} =~ s/>/>/g;
}
if ( ( $attribute{last_check} == 0 ) and ( $attribute{has_been_checked} == 0 ) ) {
## $attribute{last_check} = time;
$el_host->{MonitorStatus} = 'PENDING';
}
else {
$el_host->{MonitorStatus} = $HostStatus{ $attribute{current_state} };
}
if ( $attribute{last_state_change} == 0 ) { $attribute{last_state_change} = time; }
## Collage expects latency in integer. Set to ms
$attribute{check_latency} = int( 1000 * $attribute{check_latency} );
## Collage expects execution time in integer. Set to ms
$attribute{check_execution_time} = int( 1000 * $attribute{check_execution_time} );
$el_host->{CheckType} = $CheckType{ $attribute{check_type} };
$el_host->{CurrentAttempt} = $attribute{current_attempt};
$el_host->{CurrentNotificationNumber} = $attribute{current_notification_number};
$el_host->{ExecutionTime} = $attribute{check_execution_time};
$el_host->{LastCheckTime} = time_text( $attribute{last_check} );
$el_host->{LastNotificationTime} = time_text( $attribute{last_notification} );
$el_host->{LastPluginOutput} = $attribute{plugin_output};
$el_host->{LastStateChange} = time_text( $attribute{last_state_change} );
$el_host->{Latency} = $attribute{check_latency};
$el_host->{MaxAttempts} = $attribute{max_attempts};
$el_host->{NextCheckTime} = time_text( $attribute{next_check} );
$el_host->{PercentStateChange} = $attribute{percent_state_change};
$el_host->{ScheduledDowntimeDepth} = $attribute{scheduled_downtime_depth};
$el_host->{StateType} = $StateType{ $attribute{state_type} };
$el_host->{TimeDown} = $attribute{last_time_down};
$el_host->{TimeUnreachable} = $attribute{last_time_unreachable};
$el_host->{TimeUp} = $attribute{last_time_up};
$el_host->{isAcknowledged} = $attribute{problem_has_been_acknowledged};
$el_host->{isChecksEnabled} = $attribute{active_checks_enabled};
$el_host->{isEventHandlersEnabled} = $attribute{event_handler_enabled};
$el_host->{isFailurePredictionEnabled} = $attribute{failure_prediction_enabled};
$el_host->{isFlapDetectionEnabled} = $attribute{flap_detection_enabled};
$el_host->{isHostFlapping} = $attribute{is_flapping};
$el_host->{isNotificationsEnabled} = $attribute{notifications_enabled};
$el_host->{isObsessOverHost} = $attribute{obsess_over_host};
$el_host->{isPassiveChecksEnabled} = $attribute{passive_checks_enabled};
$el_host->{isProcessPerformanceData} = $attribute{process_performance_data};
## Use global values where needed
## Obsession
if ( $global_nagios->{obsess_over_hosts} == 0 ) {
$el_host->{isObsessOverHost} = 0;
}
## Notifications
if ( $global_nagios->{enable_notifications} == 0 ) {
$el_host->{isNotificationsEnabled} = 0;
}
## Active Checks
if ( $global_nagios->{active_host_checks_enabled} == 0 ) {
$el_host->{isChecksEnabled} = 0;
}
## Passive Checks
if ( $global_nagios->{passive_host_checks_enabled} == 0 ) {
$el_host->{isPassiveChecksEnabled} = 0;
}
## Flap Detection
if ( $global_nagios->{enable_flap_detection} == 0 ) {
$el_host->{isFlapDetectionEnabled} = 0;
}
## Event Handlers
if ( $global_nagios->{enable_event_handlers} == 0 ) {
$el_host->{isEventHandlersEnabled} = 0;
}
# reset variables for next object
$state = '';
%attribute = ();
next;
}
if ( $state and ( $line =~ /\s*(\S+?)=(.*)/ ) ) {
if ( $2 ne '' ) {
$attribute{$1} = $2;
}
}
if ( $line =~ /\s*hostcomment\s*\{/ ) {
$hostcomment = 1;
next;
}
elsif ( $line =~ /\s*servicecomment\s*\{/ ) {
$servicecomment = 1;
next;
}
elsif ( $hostcomment and ( $line =~ /\s*(\S+?)=(.*)/ ) ) {
if ( $2 ne '' ) {
$attribute{$1} = $2;
}
}
elsif ( $hostcomment and ( $line =~ /\s*\}/ ) and $attribute{host_name} ) {
## Assign host comment attributes
my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = localtime( $attribute{entry_time} );
my $entrytime = sprintf '%02d-%02d-%4d %02d:%02d:%02d', $mon + 1, $mday, $year + 1900, $hour, $min, $sec;
$attribute{comment_data} =~ s/'//g;
$attribute{comment_data} =~ s/"//g;
$element_ref->{Host}->{ $attribute{host_name} }->{Comments} .=
"#!#$attribute{comment_id};::;$entrytime;::;$attribute{author};::;\'$attribute{comment_data}\'";
$hostcomment = undef;
}
elsif ( $servicecomment and ( $line =~ /\s*(\S+?)=(.*)/ ) ) {
if ( $2 ne '' ) {
$attribute{$1} = $2;
}
}
elsif ( $servicecomment and ( $line =~ /\s*\}/ ) and $attribute{host_name} ) {
## Assign service comment attributes
my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = localtime( $attribute{entry_time} );
my $entrytime = sprintf '%02d-%02d-%4d %02d:%02d:%02d', $mon + 1, $mday, $year + 1900, $hour, $min, $sec;
$attribute{comment_data} =~ s/'//g;
$attribute{comment_data} =~ s/"//g;
$element_ref->{Host}->{ $attribute{host_name} }->{Service}->{ $attribute{service_description} }->{Comments} .=
"#!#$attribute{comment_id};::;$entrytime;::;$attribute{author};::;\'$attribute{comment_data}\'";
$servicecomment = undef;
}
else { next; }
}
close STATUSFILE;
# Fix all the comments (once)
my $comment = undef;
foreach my $hostkey ( keys( %{ $element_ref->{Host} } ) ) {
my $el_host = \%{ $element_ref->{Host}->{$hostkey} };
$comment = $el_host->{Comments};
if ( defined $comment ) {
$comment =~ s/\n/ /g;
$comment =~ s/\f/ /g;
$comment =~ s/
/ /ig;
$comment =~ s/&/&/g;
$comment =~ s/"/"/g;
$comment =~ s/'/'/g;
$comment =~ s/</g;
$comment =~ s/>/>/g;
$el_host->{Comments} = $comment;
print LOG "*** Host Comments for host $hostkey: $comment\n" if $debug_debug;
}
else {
$el_host->{Comments} = ' ';
}
foreach my $servicekey ( keys( %{ $el_host->{Service} } ) ) {
my $el_svc = \%{ $el_host->{Service}->{$servicekey} };
$comment = $el_svc->{Comments};
if ( defined $comment ) {
$comment =~ s/\n/ /g;
$comment =~ s/\f/ /g;
$comment =~ s/
/ /ig;
$comment =~ s/&/&/g;
$comment =~ s/"/"/g;
$comment =~ s/'/'/g;
$comment =~ s/</g;
$comment =~ s/>/>/g;
$el_svc->{Comments} = $comment;
print LOG "*** Service Comments for host $hostkey, service $servicekey: $comment\n" if $debug_debug;
}
else {
$el_svc->{Comments} = ' ';
}
}
}
return $element_ref;
}
sub get_globals {
my $statusfile = shift;
my ( $timestamp, $msgtype );
my @field;
# FIX FUTURE: don't just abort on failure; retry 3 times or so
if ( !open( STATUSFILE, '<:unix:mmap', $statusfile ) ) {
freeze_logtime();
print "Error opening file $statusfile: $!\n";
print LOG "${logtime}Error opening file $statusfile: $!\n";
sleep $failure_sleep_time;
return undef;
}
my $state = '';
my $attribute = {};
while ( my $line = ) {
chomp $line;
if ( $line =~ /^\s*\#]/ ) { next; }
if ( !$state and ( $line =~ /\s*program(?:status)?\s*\{/ ) ) {
$state = 'Global';
next;
}
## Reading the globals in ...
if ( $state and ( $line =~ /\s*(\S+?)=(.*)/ ) ) {
if ( $2 ne '' ) {
$attribute->{$1} = $2;
print LOG "Global Attribute found: $1 = $2\n" if $debug_debug;
}
}
if ( $state and $line =~ /\s*\}/ ) {
# we are done reading globals
last;
}
}
close STATUSFILE;
return $attribute;
}
# This routine is no longer called from anywhere.
sub readNagiosfeedersConfig {
my $type = shift;
my $database = undef;
my $dbhost = undef;
my $username = undef;
my $password = undef;
my $gwconfigfile = '/usr/local/groundwork/config/db.properties';
if ( $type !~ /^(collage|insightreports)$/ ) { return 'ERROR: Invalid database type.'; }
if ( !open( CONFIG, "$gwconfigfile" ) ) {
return "ERROR: Unable to find configuration file $gwconfigfile";
}
## collage.username=collage
## collage.password=gwrk
## collage.database=GWCollageDB
## collage.dbhost = localhost
while ( my $line = ) {
chomp $line;
if ( $line =~ /\s*$type\.(\S+)\s*=\s*(\S*)\s*/ ) {
if ( $1 eq 'username' ) {
$username = $2;
}
elsif ( $1 eq 'password' ) {
$password = $2;
}
elsif ( $1 eq 'database' ) {
$database = $2;
}
elsif ( $1 eq 'dbhost' ) {
$dbhost = $2;
}
}
}
close CONFIG;
return ( $database, $dbhost, $username, $password );
}
sub build_host_xml {
my $thisnagios = shift;
my $element_ref = shift;
my $collage_ref = shift;
my $state_changes = shift; # arrayref or undef
my $insertcount = 0;
my $skipcount = 0;
my @output = ();
my %HostStatusCodes = ( '2' => 'UP', '4' => 'DOWN', '8' => 'UNREACHABLE' );
my $el_hosts = $element_ref->{Host};
my $cs_hosts = $collage_ref->{Host};
# Create XML stream -- Format:
# <{SERVICE_STATUS | HOST_STATUS | LOG_MESSAGE} database field=value | database field=value | ... />
#
foreach my $hostkey ( keys %{ $el_hosts } ) {
if ($shutdown_requested) {
log_shutdown();
return undef;
}
# if no host status change then don't send
my $host_xml = '';
if ($smart_update) {
$host_xml = hostStatusChangeXML( $el_hosts, $cs_hosts, $hostkey, $state_changes );
if ( !$host_xml ) {
$skipcount++;
next;
}
if ( $host_xml eq $restart_xml ) {
return undef;
}
}
my @xml_message = ();
push @xml_message, '{$hostkey};
foreach my $field ( keys %{ $el_host } ) {
if ( $field eq 'Service' ) { next } # skip the Service hash key
my $tmpinfo = $el_host->{$field};
$tmpinfo =~ s/"/'/g;
push @xml_message, "$field=\"$tmpinfo\" ";
}
}
push @xml_message, "/>\n";
push( @output, join( '', @xml_message ) );
if ($smart_update) {
hostStatusUpdate( $element_ref, $collage_ref, $hostkey );
}
$insertcount++;
if ( ( $insertcount % 100 ) == 0 ) {
print "Queueing hosts for insert, count=$insertcount\n" if $debug_summary;
print LOG "Queueing hosts for insert, count=$insertcount\n" if $debug_summary;
}
}
freeze_logtime();
if ($smart_update) {
print "${logtime}Total Hosts Queued for Insert Count=$insertcount. No status change for $skipcount hosts.\n" if $debug_summary;
print LOG "${logtime}Total Hosts Queued for Insert Count=$insertcount. No status change for $skipcount hosts.\n" if $debug_summary;
}
else {
print "${logtime}Total Hosts Queued for Insert Count=$insertcount.\n" if $debug_summary;
print LOG "${logtime}Total Hosts Queued for Insert Count=$insertcount.\n" if $debug_summary;
}
return \@output;
}
sub build_service_xml {
my $thisnagios = shift;
my $element_ref = shift;
my $collage_ref = shift;
my $state_changes = shift; # arrayref or undef
my $insertcount = 0;
my $skipcount = 0;
my @output = ();
my $el_hosts = $element_ref->{Host};
my $cs_hosts = $collage_ref->{Host};
# Create XML stream -- Format:
# <{SERVICE_STATUS | HOST_STATUS | LOG_MESSAGE} database field=value | database field=value | ... />
#
foreach my $hostkey ( keys %{$el_hosts} ) {
if ($shutdown_requested) {
log_shutdown();
return undef;
}
my $el_svcs = $el_hosts->{$hostkey}->{Service};
my $cs_svcs = $cs_hosts->{$hostkey}->{Service};
foreach my $servicekey ( keys %{$el_svcs} ) {
# if no service status change, then don't send
my $service_xml = '';
if ($smart_update) {
$service_xml = serviceStatusChangeXML( $el_svcs, $cs_svcs, $hostkey, $servicekey, $state_changes );
if ( !$service_xml ) {
$skipcount++;
next;
}
if ( $service_xml eq $restart_xml ) {
return undef;
}
}
my @xml_message = ();
push @xml_message, '{$servicekey};
foreach my $field ( keys %{$el_svc} ) {
my $tmpinfo = $el_svc->{$field};
$tmpinfo =~ s/"/'/g;
push @xml_message, "$field=\"$tmpinfo\" ";
}
}
push @xml_message, "/>\n";
push( @output, join( '', @xml_message ) );
if ($smart_update) {
serviceStatusUpdate( $element_ref, $collage_ref, $hostkey, $servicekey );
}
$insertcount++;
if ( ( $insertcount % 100 ) == 0 ) {
print "Queueing services for insert, count=$insertcount\n" if $debug_summary;
print LOG "Queueing services for insert, count=$insertcount\n" if $debug_summary;
}
}
}
freeze_logtime();
if ($smart_update) {
print "${logtime}Total Services Queued for Insert Count=$insertcount. No status change for $skipcount services.\n" if $debug_summary;
print LOG "${logtime}Total Services Queued for Insert Count=$insertcount. No status change for $skipcount services.\n" if $debug_summary;
}
else {
print "${logtime}Total Services Queued for Insert Count=$insertcount.\n" if $debug_summary;
print LOG "${logtime}Total Services Queued for Insert Count=$insertcount.\n" if $debug_summary;
}
return \@output;
}
sub push_host_state_change {
my $host = shift;
my $el_host = shift;
my $state_changes = shift;
my $el_status = $el_host->{MonitorStatus};
if ( defined($el_status) && $el_status !~ /PENDING/ ) {
my $check_state = ( $el_status =~ /UP/ ) ? 0 : 1;
## Reverse the XML Substitution needed for Foundation in the status text.
my $host_text = $el_host->{LastPluginOutput};
$host_text =~ s/>/>/g;
$host_text =~ s/</{MonitorStatus};
if ( defined($el_status) && $el_status !~ /PENDING/ ) {
my $check_state = ( $el_status =~ /OK/ ) ? 0 : ( $el_status =~ /WARNING/ ) ? 1 : ( $el_status =~ /CRITICAL/ ) ? 2 : 3;
my $service_text = $el_svc->{LastPluginOutput};
$service_text =~ s/>/>/g;
$service_text =~ s/</{$hostkey};
my $cs_host = $cs_hosts->{$hostkey};
my $data_change = 0;
my $el_host_field;
my $cs_host_field;
# We always need these fields if we send any XML (GWMON-7684) ...
foreach my $field qw(
MonitorStatus
ScheduledDowntimeDepth
LastStateChange
) {
$el_host_field = $el_host->{$field}; $el_host_field = '' if not defined $el_host_field;
$cs_host_field = $cs_host->{$field}; $cs_host_field = '' if not defined $cs_host_field;
my $tmpinfo = $el_host_field;
$tmpinfo =~ s/"/'/g;
push @host_xml, "$field=\"$tmpinfo\" ";
if ( $el_host_field ne $cs_host_field ) {
$data_change = 1;
}
}
## Check each condition that might require an update to the database status.
foreach my $field (qw(
Comments
CurrentNotificationNumber
LastNotificationTime
MaxAttempts
StateType
isAcknowledged
isChecksEnabled
isEventHandlersEnabled
isFlapDetectionEnabled
isNotificationsEnabled
isPassiveChecksEnabled
), @non_default_host_data_change) {
$el_host_field = $el_host->{$field}; $el_host_field = '' if not defined $el_host_field;
$cs_host_field = $cs_host->{$field}; $cs_host_field = '' if not defined $cs_host_field;
if ( $el_host_field ne $cs_host_field ) {
my $tmpinfo = $el_host_field;
$tmpinfo =~ s/"/'/g;
push @host_xml, "$field=\"$tmpinfo\" ";
$data_change = 1;
}
}
my $timing_change = 0;
## Check each condition that might require an update to the timing change fields
## (sync only on heartbeat, or if other data has changed).
foreach my $field (qw(
ExecutionTime
Latency
LastCheckTime
NextCheckTime
PercentStateChange
CurrentAttempt
LastPluginOutput
), @non_default_host_timing_change) {
$el_host_field = $el_host->{$field}; $el_host_field = '' if not defined $el_host_field;
$cs_host_field = $cs_host->{$field}; $cs_host_field = '' if not defined $cs_host_field;
if ( $el_host_field ne $cs_host_field ) {
my $tmpinfo = $el_host_field;
$tmpinfo =~ s/"/'/g;
push @host_xml, "$field=\"$tmpinfo\" ";
$timing_change = 1;
}
}
if ( ( $timing_change == 1 ) && ( $data_change == 0 ) ) {
if ($heartbeat_mode || $el_host->{StateType} eq 'SOFT') {
# We may push host state changes to remote servers even if we're not in heartbeat mode,
# so the parent Nagios has a chance to clock its SOFT-to-HARD state machine.
push_host_state_change( $hostkey, $el_host, $state_changes ) if defined $state_changes;
}
if ($heartbeat_mode) {
print LOG "Accepting heartbeat change for host: $hostkey\n" if $debug_basic;
return join( '', @host_xml );
}
else {
print LOG "Rejecting change since it's just a timing update and we are not doing a heartbeat: $hostkey\n" if $debug_basic;
return $no_xml;
}
}
if ( $data_change == 1 ) {
## Check for "Pending Transition", so we can send an event and trigger a state change
## when we go from PENDING to UP
if ( ( $el_host->{MonitorStatus} eq 'UP' ) and ( $cs_host->{MonitorStatus} ) eq 'PENDING' ) {
my $queueing_status = queue_pending_host_event( $el_host, $hostkey );
return $restart_xml if $queueing_status != CONTINUE_STATUS;
}
print LOG Data::Dumper->Dump([\%{$cs_host}], [qw(\%{cs_hosts})]) if $debug_ridiculous;
print LOG Data::Dumper->Dump([\%{$el_host}], [qw(\%{el_hosts})]) if $debug_ridiculous;
print LOG "State changed for $hostkey -- should tell Foundation now\n" if $debug_basic;
push_host_state_change( $hostkey, $el_host, $state_changes ) if defined $state_changes;
return join( '', @host_xml );
}
return $no_xml;
}
sub serviceStatusChangeXML {
my $el_svcs = shift;
my $cs_svcs = shift;
my $hostkey = shift;
my $servicekey = shift;
my $state_changes = shift; # arrayref or undef
my @service_xml = ();
my $el_svc = $el_svcs->{$servicekey};
my $cs_svc = $cs_svcs->{$servicekey};
my $data_change = 0;
my $el_svc_field;
my $cs_svc_field;
# We always need these fields if we send any XML (GWMON-7684) ...
foreach my $field qw(
MonitorStatus
ScheduledDowntimeDepth
LastStateChange
) {
$el_svc_field = $el_svc->{$field}; $el_svc_field = '' if not defined $el_svc_field;
$cs_svc_field = $cs_svc->{$field}; $cs_svc_field = '' if not defined $cs_svc_field;
my $tmpinfo = $el_svc_field;
$tmpinfo =~ s/"/'/g;
push @service_xml, "$field=\"$tmpinfo\" ";
# but don't miss a change to these ...
if ( $el_svc_field ne $cs_svc_field ) {
$data_change = 1;
}
}
## Check each condition that might require an update to the database status.
foreach my $field (qw(
Comments
CurrentNotificationNumber
LastNotificationTime
isAcceptPassiveChecks
isChecksEnabled
isEventHandlersEnabled
isFlapDetectionEnabled
isNotificationsEnabled
isProblemAcknowledged
MaxAttempts
StateType
), @non_default_service_data_change) {
$el_svc_field = $el_svc->{$field}; $el_svc_field = '' if not defined $el_svc_field;
$cs_svc_field = $cs_svc->{$field}; $cs_svc_field = '' if not defined $cs_svc_field;
if ( $el_svc_field ne $cs_svc_field ) {
my $tmpinfo = $el_svc_field;
$tmpinfo =~ s/"/'/g;
push @service_xml, "$field=\"$tmpinfo\" ";
$data_change = 1;
}
}
my $timing_change = 0;
## Check fields that constitute a timing update (sync only on heartbeat, or if other data has changed).
foreach my $field (qw(
LastCheckTime
NextCheckTime
Latency
ExecutionTime
PercentStateChange
CurrentAttempt
LastPluginOutput
), @non_default_service_timing_change) {
$el_svc_field = $el_svc->{$field}; $el_svc_field = '' if not defined $el_svc_field;
$cs_svc_field = $cs_svc->{$field}; $cs_svc_field = '' if not defined $cs_svc_field;
if ( $el_svc_field ne $cs_svc_field ) {
my $tmpinfo = $el_svc_field;
$tmpinfo =~ s/"/'/g;
push @service_xml, "$field=\"$tmpinfo\" ";
$timing_change = 1;
}
}
if ( ($timing_change == 1) && ($data_change == 0) ) {
if ($heartbeat_mode || $el_svc->{StateType} eq 'SOFT') {
# We may push service state changes to remote servers even if we're not in heartbeat mode,
# so the parent Nagios has a chance to clock its SOFT-to-HARD state machine.
push_service_state_change( $hostkey, $servicekey, $el_svc, $state_changes ) if defined $state_changes;
}
if ($heartbeat_mode) {
print LOG "Accepting heartbeat change for host: $hostkey and service $servicekey\n" if $debug_basic;
return join( '', @service_xml );
} else {
print LOG "Rejecting change since it's just a timing update and we are not doing a heartbeat: $servicekey\n" if $debug_basic;
return $no_xml;
}
}
if ($data_change == 1) {
## Check for "Pending Transition", so we can send an event and trigger a state change
## when we go from Pending to OK
if ( ( $el_svc->{MonitorStatus} eq 'OK' ) and ( $cs_svc->{MonitorStatus} ) eq 'PENDING' ) {
my $queueing_status = queue_pending_svc_event( $el_svc, $hostkey, $servicekey );
return $restart_xml if $queueing_status != CONTINUE_STATUS;
}
if ( $debug_debug ) {
print LOG "Found changed $servicekey\n";
print LOG Data::Dumper->Dump( [ \%{$cs_svc} ], [qw(\%{cs_svcs})] );
print LOG Data::Dumper->Dump( [ \%{$el_svc} ], [qw(\%{el_svcs})] );
}
push_service_state_change( $hostkey, $servicekey, $el_svc, $state_changes ) if defined $state_changes;
return join( '', @service_xml );
}
return $no_xml;
}
sub queue_pending_host_event {
## This subroutine sends an event in the rare case where the host has transitioned from PENDING to UP.
## Nagios does not recognize this as an event, but we want it in Foundation so we are detecting and
## sending it here. After initial script startup, when a lot of these might be found, there is not much
## point in bundling these, as they will trickle in based on the scheduler, and should only occur after
## hosts are added.
my $el_host = shift;
my $hostkey = shift;
# Bail if events are off.
if (not $send_events_for_pending_to_ok) {
return CONTINUE_STATUS;
}
my @xml_message = ();
push @xml_message, '{LastPluginOutput};
$tmp =~ s/\n/ /g;
$tmp =~ s/
/ /ig;
$tmp =~ s/&/&/g;
$tmp =~ s/"/"/g;
$tmp =~ s/'/'/g;
$tmp =~ s/</g;
$tmp =~ s/>/>/g;
push @xml_message, "TextMessage=\"$tmp\" ";
$tmp = time_text(time);
push @xml_message, "ReportDate=\"$tmp\" ";
push @xml_message, "SubComponent=\"$hostkey\" ";
push @xml_message, "LastInsertDate=\"$el_host->{LastCheckTime}\" ";
push @xml_message, 'ErrorType="HOST ALERT" ';
push @xml_message, '/>';
my $xml_message = join( '', @xml_message );
print LOG "Pending Transition Host Event:\n$xml_message\n" if $debug_xml;
push @event_messages, $xml_message;
$message_counter = send_pending_events( $message_counter, $max_event_bundle_size );
return ($message_counter < 0) ? RESTART_STATUS : CONTINUE_STATUS;
}
sub queue_pending_svc_event {
## This subroutine sends an event in the rare case where the service has transitioned from PENDING to OK.
## Nagios does not recognize this as an event, but we want it in Foundation so we are detecting and
## sending it here. After initial script startup, when a lot of these might be found, there is not much
## point in bundling these, as they will trickle in based on the scheduler, and should only occur after
## services are added.
my $el_svc = shift;
my $hostkey = shift;
my $servicekey = shift;
# Bail if events are off.
if (not $send_events_for_pending_to_ok) {
return CONTINUE_STATUS;
}
my @xml_message = ();
push @xml_message, '{MonitorStatus}\" ";
push @xml_message, "MonitorStatus=\"$el_svc->{MonitorStatus}\" ";
my $tmp = $el_svc->{LastPluginOutput};
$tmp =~ s/\n/ /g;
$tmp =~ s/
/ /ig;
$tmp =~ s/&/&/g;
$tmp =~ s/"/"/g;
$tmp =~ s/'/'/g;
$tmp =~ s/</g;
$tmp =~ s/>/>/g;
push @xml_message, "TextMessage=\"$tmp\" ";
$tmp = time_text(time);
push @xml_message, "ReportDate=\"$tmp\" ";
push @xml_message, "LastInsertDate=\"$el_svc->{LastCheckTime}\" ";
push @xml_message, "SubComponent=\"$hostkey:$servicekey\" ";
push @xml_message, 'ErrorType="SERVICE ALERT" ';
push @xml_message, '/>';
my $xml_message = join( '', @xml_message );
print LOG "Pending Transition Service Event:\n$xml_message\n" if $debug_xml;
push @event_messages, $xml_message;
$message_counter = send_pending_events( $message_counter, $max_event_bundle_size );
return ($message_counter < 0) ? RESTART_STATUS : CONTINUE_STATUS;
}
sub hostStatusUpdate {
my $element_ref = shift;
my $collage_ref = shift;
my $hostkey = shift;
my $el_host = $element_ref->{Host}->{$hostkey};
my $cs_host = \%{ $collage_ref->{Host}->{$hostkey} };
#$cs_host = $el_host;
$cs_host->{Comments} = $el_host->{Comments};
$cs_host->{CurrentAttempt} = $el_host->{CurrentAttempt};
$cs_host->{CurrentNotificationNumber} = $el_host->{CurrentNotificationNumber};
$cs_host->{LastNotificationTime} = $el_host->{LastNotificationTime};
$cs_host->{ExecutionTime} = $el_host->{ExecutionTime};
$cs_host->{LastCheckTime} = $el_host->{LastCheckTime};
$cs_host->{Latency} = $el_host->{Latency};
$cs_host->{MaxAttempts} = $el_host->{MaxAttempts};
$cs_host->{MonitorStatus} = $el_host->{MonitorStatus};
$cs_host->{NextCheckTime} = $el_host->{NextCheckTime};
$cs_host->{ScheduledDowntimeDepth} = $el_host->{ScheduledDowntimeDepth};
$cs_host->{StateType} = $el_host->{StateType};
$cs_host->{isAcknowledged} = $el_host->{isAcknowledged};
$cs_host->{isChecksEnabled} = $el_host->{isChecksEnabled};
$cs_host->{isEventHandlersEnabled} = $el_host->{isEventHandlersEnabled};
$cs_host->{isFlapDetectionEnabled} = $el_host->{isFlapDetectionEnabled};
# $cs_host->{isHostFlapping} = $el_host->{isHostFlapping};
$cs_host->{isNotificationsEnabled} = $el_host->{isNotificationsEnabled};
# $cs_host->{isObsessOverHost} = $el_host->{isObsessOverHost};
$cs_host->{isPassiveChecksEnabled} = $el_host->{isPassiveChecksEnabled};
$cs_host->{LastPluginOutput} = $el_host->{LastPluginOutput};
$cs_host->{PercentStateChange} = $el_host->{PercentStateChange};
$cs_host->{LastStateChange} = $el_host->{LastStateChange};
return;
}
sub serviceStatusUpdate {
my $element_ref = shift;
my $collage_ref = shift;
my $hostkey = shift;
my $servicekey = shift;
my $el_svc = $element_ref->{Host}->{$hostkey}->{Service}->{$servicekey};
my $cs_svc = \%{ $collage_ref->{Host}->{$hostkey}->{Service}->{$servicekey} };
# $cs_svc = $el_svc;
$cs_svc->{Comments} = $el_svc->{Comments};
$cs_svc->{CurrentAttempt} = $el_svc->{CurrentAttempt};
$cs_svc->{CurrentNotificationNumber} = $el_svc->{CurrentNotificationNumber};
$cs_svc->{LastNotificationTime} = $el_svc->{LastNotificationTime};
$cs_svc->{LastCheckTime} = $el_svc->{LastCheckTime};
$cs_svc->{MonitorStatus} = $el_svc->{MonitorStatus};
$cs_svc->{NextCheckTime} = $el_svc->{NextCheckTime};
$cs_svc->{ScheduledDowntimeDepth} = $el_svc->{ScheduledDowntimeDepth};
$cs_svc->{isAcceptPassiveChecks} = $el_svc->{isAcceptPassiveChecks};
$cs_svc->{isChecksEnabled} = $el_svc->{isChecksEnabled};
$cs_svc->{isEventHandlersEnabled} = $el_svc->{isEventHandlersEnabled};
$cs_svc->{isFlapDetectionEnabled} = $el_svc->{isFlapDetectionEnabled};
$cs_svc->{isNotificationsEnabled} = $el_svc->{isNotificationsEnabled};
# $cs_svc->{isObsessOverService} = $el_svc->{isObsessOverService};
$cs_svc->{isProblemAcknowledged} = $el_svc->{isProblemAcknowledged};
# $cs_svc->{isServiceFlapping} = $el_svc->{isServiceFlapping};
$cs_svc->{MaxAttempts} = $el_svc->{MaxAttempts};
$cs_svc->{PercentStateChange} = $el_svc->{PercentStateChange};
$cs_svc->{LastPluginOutput} = $el_svc->{LastPluginOutput};
$cs_svc->{Latency} = $el_svc->{Latency};
$cs_svc->{ExecutionTime} = $el_svc->{ExecutionTime};
$cs_svc->{LastStateChange} = $el_svc->{LastStateChange};
$cs_svc->{StateType} = $el_svc->{StateType};
return;
}
sub find_deltas {
my $element_ref = shift;
my $collage_status_ref = shift;
my $deltas = {};
foreach my $hostkey ( keys( %{ $collage_status_ref->{Host} } ) ) {
my $el_host = $element_ref->{Host}->{$hostkey};
if ( !defined $el_host ) {
$deltas->{FoundationHost}->{$hostkey} = 1;
next;
}
foreach my $servicekey ( keys( %{ $collage_status_ref->{Host}->{$hostkey}->{Service} } ) ) {
my $el_svc = $el_host->{Service}->{$servicekey};
if ( !defined $el_svc ) {
$deltas->{FoundationHost}->{$hostkey}->{Service}->{$servicekey} = 1;
}
}
}
foreach my $hostkey ( keys( %{ $element_ref->{Host} } ) ) {
my $cs_host = $collage_status_ref->{Host}->{$hostkey};
if ( !defined $cs_host ) {
$deltas->{NagiosHost}->{$hostkey} = 1;
next;
}
foreach my $servicekey ( keys( %{ $element_ref->{Host}->{$hostkey}->{Service} } ) ) {
if ( !defined $cs_host->{Service}->{$servicekey} ) {
$deltas->{NagiosHost}->{$hostkey}->{Service}->{$servicekey} = 1;
}
}
}
return $deltas;
}
sub assemble_remote_full_dump {
my $collage_status_ref = shift;
my @states = ();
my $cs_host = undef;
my $cs_serv = undef;
my $cs_status = undef;
my $check_state = undef;
my $cs_hosts = $collage_status_ref->{Host};
my $cs_services = undef;
my $host_text = undef;
my $service_text = undef;
$#states = $heartbeat_high_water_mark; # pre-extend the array, for efficiency
$#states = -1; # truncate the array, since we don't have any messages yet
foreach my $host ( keys( %{$cs_hosts} ) ) {
$cs_host = $cs_hosts->{$host};
$cs_status = $cs_host->{MonitorStatus};
if ( $cs_status =~ /UP/ ) {
$check_state = 0;
}
elsif ( $cs_status =~ /PENDING/ ) {
next;
}
else {
$check_state = 1;
}
## Reverse the XML Substitution needed for Foundation in the status text.
$host_text = $cs_host->{LastPluginOutput};
$host_text =~ s/>/>/g;
$host_text =~ s/</{Service};
foreach my $service ( keys( %{$cs_services} ) ) {
$cs_serv = $cs_services->{$service};
$cs_status = $cs_serv->{MonitorStatus};
if ( $cs_status =~ /PENDING/ ) {
next;
}
elsif ( $cs_status =~ /OK/ ) {
$check_state = 0;
}
elsif ( $cs_status =~ /WARNING/ ) {
$check_state = 1;
}
elsif ( $cs_status =~ /CRITICAL/ ) {
$check_state = 2;
}
else {
$check_state = 3;
}
$service_text = $cs_serv->{LastPluginOutput};
$service_text =~ s/>/>/g;
$service_text =~ s/</{Host} };
my $cs_hosts = \%{ $collage_ref->{Host} };
my $el_services = undef;
my $cs_services = undef;
my $host_text = undef;
my $service_text = undef;
$#states = $state_change_high_water_mark; # pre-extend the array, for efficiency
$#states = -1; # truncate the array, since we don't have any messages yet
foreach my $host ( keys( %{$el_hosts} ) ) {
$el_host = \%{ $el_hosts->{$host} };
$cs_host = \%{ $cs_hosts->{$host} };
$el_status = $el_host->{MonitorStatus};
$cs_status = $cs_host->{MonitorStatus};
if ( $el_status ne $cs_status ) {
if ( $el_status =~ /UP/ ) {
$check_state = 0;
}
elsif ( $el_status =~ /PENDING/ ) {
next;
}
else {
$check_state = 1;
}
## Reverse the XML Substitution needed for Foundation in the status text.
$host_text = $el_host->{LastPluginOutput};
$host_text =~ s/>/>/g;
$host_text =~ s/</{Service} };
$cs_services = \%{ $cs_host->{Service} };
foreach my $service ( keys( %{$el_services} ) ) {
$el_serv = \%{ $el_services->{$service} };
$cs_serv = \%{ $cs_services->{$service} };
$el_status = $el_serv->{MonitorStatus};
$cs_status = $cs_serv->{MonitorStatus};
if ( $el_status ne $cs_status ) {
if ( $el_status =~ /PENDING/ ) {
next;
}
elsif ( $el_status =~ /OK/ ) {
$check_state = 0;
}
elsif ( $el_status =~ /WARNING/ ) {
$check_state = 1;
}
elsif ( $el_status =~ /CRITICAL/ ) {
$check_state = 2;
}
else {
$check_state = 3;
}
$service_text = $el_serv->{LastPluginOutput};
$service_text =~ s/>/>/g;
$service_text =~ s/</ $last_index;
# We use an array slice here to avoid a lot of expensive and pointless copying into a second array.
# We concatenate all the messages in the slice to avoid a lot of individual system calls within the
# print statement, as print will make a separate call for each list element provided.
$message_set = join( '', @$messages[ $first .. $last ] );
open NSCA, '|-', "$send_nsca_command >> $logfile";
print NSCA $message_set;
$failed |= !close NSCA;
if ($send_to_secondary_NSCA) {
open NSCA, '|-', "$secondary_send_nsca_command >> $logfile";
print NSCA $message_set;
$failed |= !close NSCA;
}
sleep $nsca_batch_delay if $last < $last_index;
}
return !$failed;
}
sub gdma_spool {
my $gdma_results = shift; # arrayref to possibly-empty list of previously-failed-to-spool messages
my $commands = shift; # arrayref to list of new messages to spool
## Prepend to each result the overhead info needed by the GDMA spooler, before spooling it.
my $default_retries = 0;
my $default_target = 0; # "0" implies that the result is to be sent to all the primary targets.
my $now = time();
my $prefix = join( '', $default_retries, "\t", $default_target, "\t", $now, "\t" );
push @$gdma_results, map { $prefix . $_ } @$commands;
# Flush the data out to the spool file immediately.
# We make this a non-blocking call, as we don't want to block for too long.
# If the spooling doesn't work, the prepared results will be left in place
# (in @$gdma_results) and can/should be passed back here on the next call.
my $blocking = 0;
my $spooled_result_count;
my $errstr;
if ( GDMAUtils::spool_results( $gdma_spool_filename, $gdma_results, $blocking, \$spooled_result_count, \$errstr ) ) {
@$gdma_results = ();
}
else {
## Spooling failed, but the results to spool are still there in the @$gdma_results array.
## Hopefully, they will be spooled at a later time.
log_timed_message "ERROR: GDMA spooling: $errstr";
## Safety valve: prevent an infinite growth of accumulating as-yet-unspooled results.
my $results_to_discard = @$gdma_results - $max_unspooled_results_to_save;
if ($results_to_discard > 0) {
log_timed_message "NOTICE: GDMA spooling: discarding $results_to_discard results";
splice @$gdma_results, 0, $results_to_discard;
}
}
}
__END__
NAGIOS V1 STATUS.LOG FILE
All Host Lines:
[Time of last update] HOST;
Host Name (string);
Status (OK/DOWN/UNREACHABLE);
Last Check Time (long time);
Last State Change (long time);
Acknowledged (0/1);
Time Up (long time);
Time Down (long time);
Time Unreachable (long time);
Last Notification Time (long time);
Current Notification Number (#);
Notifications Enabled (0/1);
Event Handlers Enabled (0/1);
Checks Enabled (0/1);
Flap Detection Enabled (0/1);
Host is Flapping (0/1);
Percent State Change (###.##);
Scheduled downtime depth (#);
Failure Prediction Enabled (0/1);
Process Performance Data(0/1);
Plugin Output (string)
Service Lines:
[Time of last update] SERVICE;
Host Name (string);
Service Description (string);
Status (OK/WARNING/CRITICAL/UNKNOWN);
Retry number (#/#);
State Type (SOFT/HARD);
Last check time (long time);
Next check time (long time);
Check type (ACTIVE/PASSIVE);
Checks enabled (0/1);
Accept Passive Checks (0/1);
Event Handlers Enabled (0/1);
Last state change (long time);
Problem acknowledged (0/1);
Last Hard State (OK/WARNING/CRITICAL/UNKNOWN);
Time OK (long time);
Time Unknown (long time);
Time Warning (long time);
Time Critical (long time);
Last Notification Time (long time);
Current Notification Number (#);
Notifications Enabled (0/1);
Latency (#);
Execution Time (#);
Flap Detection Enabled (0/1);
Service is Flapping (0/1);
Percent State Change (###.##);
Scheduled Downtime Depth (#);
Failure Prediction Enabled (0/1);
Process Performance Date (0/1);
Obsess Over Service (0/1);
Plugin Output (string)
Program line (second line of the status log):
[Current Time] PROGRAM;
Program Start Time (long time);
Nagios PID (#);
Daemon Mode (0/1);
Last Command Check (long time);
Last Log Rotation (long time);
Notifications Enabled (0/1);
Execute Service Checks (0/1);
Accept Passive Service Checks (0/1);
Enable Event Handlers (0/1);
Obsess Over Services (0/1);
Enable Flap Detection (0/1);
Enable Failure Prediction (0/1);
Process Performance Data (0/1)
NAGIOS V2 STATUS.DAT FILE
info {
created=1122681331
version=2.0b3
}
program {
modified_host_attributes=0
modified_service_attributes=0
nagios_pid=48776
daemon_mode=1
program_start=1122681286
last_command_check=0
last_log_rotation=0
enable_notifications=1
active_service_checks_enabled=1
passive_service_checks_enabled=1
active_host_checks_enabled=1
passive_host_checks_enabled=1
enable_event_handlers=1
obsess_over_services=0
obsess_over_hosts=0
check_service_freshness=0
check_host_freshness=0
enable_flap_detection=0
enable_failure_prediction=1
process_performance_data=0
global_host_event_handler=
global_service_event_handler=
}
host {
host_name=localhost
modified_attributes=0
check_command=check-host-alive
event_handler=
has_been_checked=1
should_be_scheduled=0
check_execution_time=0.061
check_latency=0.000
current_state=0
last_hard_state=0
check_type=0
plugin_output=PING OK - Packet loss = 0%, RTA = 0.04 ms
performance_data=
last_check=1122681125
next_check=0
current_attempt=1
max_attempts=10
state_type=1
last_state_change=1122681115
last_hard_state_change=1122681115
last_time_up=1122681125
last_time_down=0
last_time_unreachable=0
last_notification=0
next_notification=0
no_more_notifications=0
current_notification_number=0
notifications_enabled=1
problem_has_been_acknowledged=0
acknowledgement_type=0
active_checks_enabled=1
passive_checks_enabled=1
event_handler_enabled=1
flap_detection_enabled=1
failure_prediction_enabled=1
process_performance_data=1
obsess_over_host=1
last_update=1122681331
is_flapping=0
percent_state_change=0.00
scheduled_downtime_depth=0
}
service {
host_name=localhost
service_description=Current Load
modified_attributes=0
check_command=check_local_load!5.0,4.0,3.0!10.0,6.0,4.0
event_handler=
has_been_checked=1
should_be_scheduled=1
check_execution_time=0.008
check_latency=0.539
current_state=0
last_hard_state=0
current_attempt=1
max_attempts=4
state_type=1
last_state_change=1122681115
last_hard_state_change=1122681115
last_time_ok=1122681286
last_time_warning=0
last_time_unknown=0
last_time_critical=0
plugin_output=OK - load average: 0.12, 0.15, 0.21
performance_data=load1=0.123535;5.000000;10.000000;0.000000 load5=0.154785;4.000000;6.000000;0.000000 load15=0.214844;3.000000;4.000000;0.000000
last_check=1122681286
next_check=1122681586
check_type=0
current_notification_number=0
last_notification=0
next_notification=0
no_more_notifications=0
notifications_enabled=1
active_checks_enabled=1
passive_checks_enabled=1
event_handler_enabled=1
problem_has_been_acknowledged=0
acknowledgement_type=0
flap_detection_enabled=1
failure_prediction_enabled=1
process_performance_data=1
obsess_over_service=1
last_update=1122681331
is_flapping=0
percent_state_change=0.00
scheduled_downtime_depth=0
}