#!/bin/perl

###############################################################################
#
#  fault_manager.pl: An application which provides event management for events
#                    from multiple sources
#
#  $fault[0]  = event id number
#  $fault[1]  = host
#  $fault[2]  = application
#  $fault[3]  = instance
#  $fault[4]  = parameter
#  $fault[5]  = message text
#  $fault[6]  = severity
#  $fault[7]  = event date/time
#  $fault[8]  = host sla start time
#  $fault[9]  = host sla end time
#  $fault[10]  = sla include weekend flag
#  $fault[11] = current time
#  $fault[12] = sla in effect flag
#  $fault[13] = notified flag
#  $fault[14]..[n] = event specific parameters
#
###############################################################################

# Run as a daemon: fork, let the parent exit, and carry on in the child

$pid=fork;
die "Couldn't fork: $!" unless defined($pid);
exit if $pid;

use POSIX;
use DBI;

# Change from DBI:Sprite to database of your choice
# To use DBI:Sprite, these databases and tables must first be created
#
# Connection errors are read from the package variables $DBI::err and
# $DBI::errstr. There is no $DBI object, so calling methods on it would die
# while building the warning text instead of reporting the failure.
# A failed connection only warns; the daemon keeps running.
$dbh_reception = DBI->connect('DBI:Sprite:reception','user','password') or warn "Could not connect to reception log database (".$DBI::err.':'.$DBI::errstr.")!";
$dbh_rule_log = DBI->connect('DBI:Sprite:rule_log','user','password') or warn "Could not connect to rule log database (".$DBI::err.':'.$DBI::errstr.")!";

# Start a new session for the daemon process
POSIX::setsid() or die "Can't start a new session: $!";

# Set initial values to global variables
InitializeGlobals();

# Set paths to various configuration files
SetPaths();

# Run until signalled to die

$time_to_die=0;

# Signal handler: only raises the flag, the main loop stops at its next check
sub signal_handler {
	$time_to_die=1;
}

$SIG{INT} = $SIG{TERM} = $SIG{HUP} = \&signal_handler;
# The special value must be the upper-case string 'IGNORE'; any other string
# is treated as the name of a handler subroutine.
$SIG{PIPE}='IGNORE';

# Main loop: scan the event directory, process every file found in it, then
# sleep before the next scan.
until($time_to_die) {
	if (opendir(DIR, $dir)) {
		# readdir returns undef once the directory is exhausted
		while (defined($event=readdir(DIR))) {
			if (($event ne ".") and ($event ne "..")) {

				GetEvent();
				ApplyRules();
				PushOntoQueue();

				# GetEvent() has rewritten $event to the full path of the file
				unlink($event);
			}
			NotifyPending();
		}
		# Release the handle so the next scan starts from a fresh listing
		closedir(DIR);
	} else {
		# Nothing to read this round; try again after the sleep
		warn "Cannot open $dir\n";
	}
	sleep 10;
}

sub ApplyRules() {
	# Runs the rule set against the current event held in the global @fault.
	# Rules that apply to a particular class of event are selected on the
	# application field of the event; the two filtering rules run for every
	# event. Takes no arguments and returns nothing useful.
	my $application=$fault[$_fault_application];
	$application='' unless defined($application);

	if ($application eq "AVAILABILITY_CHECK") {
		# Message Manipulation
		# The rule has been defined in this file under the misspelled name
		# Fix_Availabilty_Check; call whichever spelling is available.
		if (defined(&Fix_Availability_Check)) {
			Fix_Availability_Check();
		} else {
			Fix_Availabilty_Check();
		}
	}
	if (index($application, "WEB_SERVER") >= 0) {
		# Event Matching
		Fix_Web_Server();
	}
	# Event Filtering by Host
	Check_Maintenance();
	# Event Filtering by Application
	Check_Application();
	# Event Correlations
	if ($application eq 'BACKUP') {
		CorrelateBackups();
	}
	if (index($application, "INET") >= 0) {
		CorrelateINETChecks();
	}
}

# Place holder rule as an example of message text manipulation.
# Only acts while $CONTINUE is set; at present it just writes a rule trace.
sub Fix_Availability_Check() {
	my $LOCAL_RULE_TRACE=1;
	if($CONTINUE){
		# Use Perl string operators to change text in this class of message
		if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
			Log_Rule_Trace("Fix_Availability_Check modified message text.");
		}
	}
}

# Original, misspelled name of the rule above. Kept as a thin wrapper so that
# anything still calling the old name keeps working.
sub Fix_Availabilty_Check() {
	Fix_Availability_Check();
}

sub Fix_Web_Server() {
	my $LOCAL_RULE_TRACE=1;
	my $FOUND_FLAG=0;
	# This uses a match file to look up the server that hosts the web site.
	# The logical web site is the Instance parameter of the event and is held
	# in $fault[$_fault_instance]. Each line of the match file ($web_list) is
	# "physical_host logical_site", separated by one or more spaces.
	#
	# On a match the host of the event, and every occurrence of the old host
	# name in the message text, is replaced by the physical host.
	# Does nothing unless $CONTINUE is set.
	return unless $CONTINUE;

	my @entries;
	if (open(my $config_fh, '<', $web_list)) {
		@entries=<$config_fh>;
		close($config_fh);
	} else {
		# Treated the same as an empty match file
		warn "Cannot open $web_list: $!\n";
	}

	foreach my $entry (@entries) {
		chomp($entry);
		my ($web_host, $web_site)=split / +/, $entry;
		# Skip blank or incomplete lines so that they can never match an
		# event that has no instance
		next unless defined($web_site) and length($web_site);
		next unless $web_site eq $fault[$_fault_instance];

		# Match found, change the host name in the message text to the
		# physical server name. The old name is matched literally (\Q..\E),
		# the dots in a host name are not regex wildcards. An empty pattern
		# is avoided because Perl would reuse the last successful pattern.
		my $logical_host=$fault[$_fault_host];
		if (defined($logical_host) and length($logical_host)) {
			$fault[$_fault_msg]=~ s/\Q$logical_host\E/$web_host/g;
		}
		# Change the host originating the alert to be the host containing the
		# web site
		$fault[$_fault_host]=$web_host;
		# Toggle the found flag
		$FOUND_FLAG=1;
		if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
			Log_Rule_Trace("Fix_Web_Server changed the logical web site to the physical server.");
		}
		last;
	}

	if (not $FOUND_FLAG) {
		if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
			Log_Rule_Trace("Fix_Web_Server was unable to find a physical web server for the web site.");
		}
	}
}

sub Check_Maintenance() {
	my $LOCAL_RULE_TRACE=1;
	# Host based event filter.
	# Reads the host maintenance list ($host_list, one host per line) and
	# clears $CONTINUE when the host of the current event is on it.
	#
	# Nothing is logged when the event is kept: this test runs for every
	# event, so logging a negative result would not be informative.
	return unless $CONTINUE;

	open(CONF, "< $host_list") or warn "Cannot open $host_list: $!\n";
	@lines=<CONF>;
	close(CONF);

	for my $entry (@lines) {
		chomp($entry);
		next unless $entry eq $fault[$_fault_host];

		# Host is under maintenance: stop processing this event
		$CONTINUE=0;
		if ($GLOBAL_RULE_TRACE or $LOCAL_RULE_TRACE) {
			Log_Rule_Trace("Host $fault[$_fault_host] has monitoring disabled, dropping alert.");
		}
		last;
	}
}

sub Check_Application() {
	my $LOCAL_RULE_TRACE=1;
	# Application based event filter.
	# Reads the list of applications that may raise alerts
	# ($application_list_file, one name per line) and clears $CONTINUE when
	# the application of the current event is not on it.
	#
	# Nothing is logged when the event is kept: this test runs for every
	# event, so logging a negative result would not be informative.
	return unless $CONTINUE;

	open(CONF, "< $application_list_file") or warn "Cannot open $application_list_file: $!\n";
	@lines=<CONF>;
	close(CONF);

	my $listed=0;
	for my $entry (@lines) {
		chomp($entry);
		if ($entry eq $fault[$_fault_application]) {
			$listed=1;
			last;
		}
	}
	return if $listed;

	# Unknown application: stop processing this event
	$CONTINUE=0;
	if ($GLOBAL_RULE_TRACE or $LOCAL_RULE_TRACE) {
		Log_Rule_Trace("Application $fault[$_fault_application] is not in the list of applications, dropping alert.");
	}
}

sub CorrelateBackups() {
	# Example correlation rule for hosts with several databases that are
	# backed up each night. If one backup fails they often all fail (bad tape
	# drive), so the failures are folded together to reduce the alert count.
	#
	# Looks through @open_alerts for BACKUP alerts from the same host as the
	# current event:
	#   - none found: the event is left alone
	#   - found, one of them already a correlation alert: event is dropped
	#   - found, none correlated yet: message becomes a correlation alert
	my $LOCAL_RULE_TRACE=1;
	return unless $CONTINUE;

	my $tracing=($GLOBAL_RULE_TRACE or $LOCAL_RULE_TRACE);
	Log_Rule_Trace("Backup event, checking for correlating events.") if $tracing;

	my $matches=0;
	my $already_correlated=0;
	foreach my $alert (@open_alerts) {
		next unless $alert->[$_fault_host] eq $fault[$_fault_host];
		next unless $alert->[$_fault_application] eq 'BACKUP';
		$matches++;
		$already_correlated=1 if index($alert->[$_fault_msg], "Correlation") >= 0;
	}

	if ($matches < 1) {
		Log_Rule_Trace("No correlating backup event found.") if $tracing;
	} elsif ($already_correlated) {
		$CONTINUE=0;
		Log_Rule_Trace("Event already correlated, dropping event.") if $tracing;
	} else {
		$fault[$_fault_msg]="\"Correlation Alert, multiple backup failures on $fault[$_fault_host].\"";
		Log_Rule_Trace("Correlating event and modifying message.") if $tracing;
	}
}

sub CorrelateINETChecks() {
	# Example correlation rule for the monitoring of Internet services. All
	# such services carry "INET" in their application name (eg
	# INET_Web_Server, INET_FTP_Server). When a host goes down every service
	# on it raises an event; this rule folds the INET events of one host
	# together to reduce the number of alerts.
	#
	# Looks through @open_alerts for INET alerts from the same host as the
	# current event:
	#   - fewer than two found: the event is left alone
	#   - two or more, one already a correlation alert: event is dropped
	#   - two or more, none correlated yet: message becomes a correlation alert
	my $LOCAL_RULE_TRACE=1;
	return unless $CONTINUE;

	my $tracing=($GLOBAL_RULE_TRACE or $LOCAL_RULE_TRACE);
	Log_Rule_Trace("INET event, checking for correlating events.") if $tracing;

	my $matches=0;
	my $already_correlated=0;
	foreach my $alert (@open_alerts) {
		next unless $alert->[$_fault_host] eq $fault[$_fault_host];
		next unless index($alert->[$_fault_application], "INET") >= 0;
		$matches++;
		$already_correlated=1 if index($alert->[$_fault_msg], "Correlation") >= 0;
	}

	if ($matches < 2) {
		Log_Rule_Trace("No correlating INET event found.") if $tracing;
	} elsif ($already_correlated) {
		$CONTINUE=0;
		Log_Rule_Trace("Event already correlated, dropping event.") if $tracing;
	} else {
		$fault[$_fault_msg]="\"Correlation Alert, multiple Internet service (web, ftp, proxy, nntp, smtp, ldap) failures on $fault[$_fault_host].\"";
		Log_Rule_Trace("Correlating event and modifying message.") if $tracing;
	}
}

sub InitializeGlobals() {
	# Sets the start-up values of the global flags and counters, and the
	# named index constants used to address the fields of an SLA record and
	# of a fault record.

	# Rule tracing for every rule is off by default. It would be possible to
	# write a rule which switches it on for specific event types only.
	$GLOBAL_RULE_TRACE=0;
	# Rules are only applied to an event while this flag is set
	$CONTINUE=1;
	# Number given to the first event
	$Event_ID=1000;

	# Field positions within a host SLA record
	($_sla_name, $_sla_start_hour, $_sla_end_hour, $_sla_weekend)=(0 .. 3);

	# Field positions within a fault record
	(
		$_fault_id,
		$_fault_host,
		$_fault_application,
		$_fault_instance,
		$_fault_parameter,
		$_fault_msg,
		$_fault_severity,
		$_fault_datetime,
		$_fault_sla_start_hour,
		$_fault_sla_end_hour,
		$_fault_sla_weekend,
		$_fault_current_time,
		$_fault_sla_in_effect,
		$_fault_notified,
	)=(0 .. 13);
}

sub SetPaths() {
	# Sets the global paths of the event drop directory and of the
	# configuration files used by the rules.

	# Directory that is scanned for incoming event files.
	# NOTE(review): the value had been turned into a comment by a stray '#',
	# which made $dir silently take the value of $web_list. The absolute path
	# below is the presumed intent - confirm against the deployment.
	$dir='/ftp_root/alarm';
	# Physical host / logical web site match file
	$web_list='/opt/bin/alert/web.conf';
	# Hosts with monitoring disabled
	$host_list='/opt/bin/alert/hosts.conf';
	# Applications that are allowed to raise alerts
	$application_list_file = '/opt/bin/alert/application.conf';
	# Host SLA definitions
	$config='/opt/bin/alert/fault.conf';
}

sub GetEvent() {
	my $LOCAL_RULE_TRACE=1;

	# Read and Build Event
	#
	# Reads the event file named by the global $event (a file name inside
	# $dir), parses it into the global @fault and adds the SLA information of
	# the host. $event is rewritten to the full path of the file, which the
	# caller relies on to delete it afterwards.
	#
	# Expected file layout, one item per line:
	#   0: "Event_Manager 1.0"   1: event date/time
	#   2: application.instance.parameter
	#   3: host   4: message text   5: severity
	#
	# $CONTINUE is cleared when the file is not a valid event or its severity
	# is not recognized.

	# Reset the per-event globals. $CONTINUE in particular must be set again
	# here, otherwise one dropped event would cause every later event to be
	# dropped too.
	$CONTINUE=1;
	$SLA_in_Effect='Y';
	@fault=();

	# Read Event
	$event=$dir."/".$event;
	my @event_file_lines;
	if (open(my $alert_fh, '<', $event)) {
		@event_file_lines=<$alert_fh>;
		close($alert_fh);
	} else {
		# Falls through to the header check below, which rejects the file
		warn "Cannot open $event: $!\n";
	}

	# Parse event and assign to fault array elements

	# This assumes an incoming alert in the standard format

	my $header=$event_file_lines[0];
	$header='' unless defined($header);
	chomp($header);

	# If header is not valid, delete the file
	if ($header ne "Event_Manager 1.0") {
		if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
			Log_Rule_Trace("File was not a valid alert, deleting.");
		}
		unlink($event);
		$CONTINUE=0;
		return;
	}

	# Set the event id
	$fault[$_fault_id]=$Event_ID++;
	# Set the datetime
	$fault[$_fault_datetime]=$event_file_lines[1];
	chomp($fault[$_fault_datetime]);
	# Set the class, instance and parameter
	my $event_info=$event_file_lines[2];
	chomp($event_info);
	my @event_pieces=split /\./, $event_info;
	$fault[$_fault_application]=$event_pieces[0];
	$fault[$_fault_instance]=$event_pieces[1];
	$fault[$_fault_parameter]=$event_pieces[2];
	# Set the host
	$fault[$_fault_host]=$event_file_lines[3];
	chomp($fault[$_fault_host]);
	# Set the message text: single quotes are removed and the text is
	# wrapped in double quotes
	$fault[$_fault_msg]=$event_file_lines[4];
	chomp($fault[$_fault_msg]);
	$fault[$_fault_msg]=~ s/'//g;
	$fault[$_fault_msg]="\"$fault[$_fault_msg]\"";
	# Set the severity
	$fault[$_fault_severity]=$event_file_lines[5];
	chomp($fault[$_fault_severity]);

	# Log the reception now that the fields being logged are populated
	Log_Fault_Reception();

	# Different monitoring tools will use different words for severity
	# This next section translates to the standard terms CRITICAL and FATAL
	# and NORMAL
	# The STATE_CHANGE state is BMC Patrol's way saying alert has gone back
	# to normal from an alert state
	if ($fault[$_fault_severity] eq "WARNING") {
		$fault[$_fault_severity]="CRITICAL";
	} elsif ($fault[$_fault_severity] eq "ALARM") {
		$fault[$_fault_severity]="FATAL";
	} elsif ($fault[$_fault_severity] eq "STATE_CHANGE"){
		$fault[$_fault_severity]="NORMAL";
	} else {
		# Event state is unrecognized
		if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
			Log_Rule_Trace("Unrecognized event state, discontinuing processing.");
		}
		$CONTINUE=0;
		return;
	}

	# Service Level Agreement (SLA) Components of event
	# The SLA of the host is looked up to determine if the alert is occurring
	# within the SLA timeframe. This is determined by opening a configuration
	# file and looking up the start and end service hours of the host and
	# whether its service timeframe includes weekends

	my $found_flag='N';
	$SLA_Name="none";

	# Read the configuration, stripping out comments, spaces and blank lines.
	# The lines are collected in a lexical so that nothing is carried over
	# from one event to the next.
	my @config_lines;
	if (open(my $conf_fh, '<', $config)) {
		while (my $line=<$conf_fh>) {
			$line=~ s/#.*//;
			# Every space has to go, a space left in front of a host name
			# would prevent it from matching the host of the event
			$line=~ s/ //g;
			next if $line=~ /^\s*$/;
			push @config_lines, $line;
		}
		close($conf_fh);
	} else {
		warn "Cannot open $config: $!\n";
	}

	# Take each line in the configuration file that defines host sla
	# information ("Host_SLA:name,start,end,weekend,host,host,...") and
	# split it into a row of a two-dimensional array
	my @host_slas;
	foreach my $sla_line (grep /Host_SLA:/, @config_lines) {
		my @parts=split /:/, $sla_line;
		shift @parts;
		my @items=split /,/, $parts[0];
		chomp(@items);
		push @host_slas, [@items];
	}

	# Search for the host and if found populate the sla information
	# into the event. Hosts start at the fifth field of a row. If the host
	# is listed more than once the last entry wins.
	foreach my $sla (@host_slas) {
		foreach my $j (4 .. $#{$sla}) {
			next unless $sla->[$j] eq $fault[$_fault_host];
			$SLA_Name=$sla->[$_sla_name];
			$fault[$_fault_sla_start_hour]=$sla->[$_sla_start_hour];
			$fault[$_fault_sla_end_hour]=$sla->[$_sla_end_hour];
			$fault[$_fault_sla_weekend]=$sla->[$_sla_weekend];
			$fault[$_fault_current_time]=time;
			# Check if the host is currently in the sla timeframe
			SLAinEffect();
			$fault[$_fault_sla_in_effect]=$SLA_in_Effect;
			$fault[$_fault_notified]='N';
			$found_flag='Y';
		}
	}

	# Host not found in config file, by default treat it as having a service
	# level of all day, everyday
	#
	# Some clients may change this to drop the event if the host is not found
	# so that only fully configured hosts can generate alerts
	if ($found_flag eq 'N') {
		$fault[$_fault_sla_start_hour]=0;
		$fault[$_fault_sla_end_hour]=24;
		$fault[$_fault_sla_weekend]='Y';
		$fault[$_fault_current_time]=time;
		$fault[$_fault_sla_in_effect]='Y';
		$fault[$_fault_notified]='N';
	}

	if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
		Log_Rule_Trace("Fault on $fault[$_fault_host] of application $fault[$_fault_application] of instance $fault[$_fault_instance].");
	}
}

sub Notify() {
	my $LOCAL_RULE_TRACE=1;
	# Hands the current event (global @fault) to the notification
	# application and waits for it to finish. The exit status returned by
	# system() is left in the global $return.
	# It could also integrate directly with a problem management application
	# if it handled notifications
	#
	# The fields are passed as a list, one argument per field, so no shell is
	# involved: text taken from an event file can neither be split into
	# several arguments nor be interpreted as shell syntax, and an empty
	# field keeps its position.

	if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
		Log_Rule_Trace("Commencing notification for event# $fault[$_fault_id].");
	}

	my @arguments;
	foreach my $field (@fault) {
		my $argument=defined($field) ? $field : '';
		# The message text is stored wrapped in double quotes, which a shell
		# would have removed; strip one wrapping pair here so the notifier
		# receives the bare text
		$argument=~ s/\A"(.*)"\z/$1/s;
		push @arguments, $argument;
	}
	$return=system('/opt/bin/alert/notify.pl', @arguments);

	if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
		Log_Rule_Trace("Notification complete - $return.");
	}
}

sub PushOntoQueue() {
	my $LOCAL_RULE_TRACE=1;
	# Final step for an event that passed the rules ($CONTINUE still set):
	# a NORMAL event removes its fault from the notification queue, anything
	# else is added to the queue. The working copy in @fault is cleared in
	# every case.
	if ($CONTINUE) {
		my $is_recovery=($fault[$_fault_severity] eq "NORMAL");

		if ($GLOBAL_RULE_TRACE or $LOCAL_RULE_TRACE) {
			Log_Rule_Trace(
				$is_recovery
					? "Normal state alert received, check to see if fault event on notification queue."
					: "Putting alert onto notification queue."
			);
		}

		if ($is_recovery) {
			RemoveFromArray();
		} else {
			AddtoArray();
		}
	}
	@fault=();
}

sub AddtoArray() {
	# Appends a copy of the current event (@fault) to the open event buffer
	# @open_alerts, which state management and correlation are evaluated
	# against. A copy is stored so later changes to @fault do not alter it.
	my @snapshot=@fault;
	push @open_alerts, \@snapshot;
}

sub NotifyPending() {
	my $LOCAL_RULE_TRACE=1;
	# Walks the open event queue (@open_alerts):
	#   - events already notified and older than 15 minutes are dropped
	#   - events not yet notified are sent for notification as soon as their
	#     SLA is in effect, and are then flagged as notified on the queue
	#
	# Notify() and SLAinEffect() work on the global @fault, so each queued
	# event is copied into it in turn. @fault is localized so that the
	# caller's copy is restored when this sub returns.
	local @fault;

	#Loop through all events
	my $i=0;
	while ($i <= $#open_alerts) {
		#If event = notified and time > 15 minutes (900 seconds), delete from array
		my $current_time=time;
		@fault=@{$open_alerts[$i]};
		if (($current_time - $fault[$_fault_current_time] > 900) and ($fault[$_fault_notified] eq "Y")) {
			splice @open_alerts, $i, 1;
			if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
				Log_Rule_Trace("Dropping event from queue due to time expiry.");
			}
			# The following event has moved into slot $i, so the index must
			# not be advanced here
			next;
		}
		#For all non-notified events
		if ($fault[$_fault_notified] eq "N") {
			if ($fault[$_fault_sla_in_effect] ne "Y") {
				# Re-check to see if the SLA has now come into effect
				SLAinEffect();
				$open_alerts[$i][$_fault_sla_in_effect]=$SLA_in_Effect;
				if ($SLA_in_Effect eq "Y") {
					if(($GLOBAL_RULE_TRACE) or ($LOCAL_RULE_TRACE)) {
						Log_Rule_Trace("Alert has now moved into SLA, dispatching for notification.");
					}
				}
			}
			# If the SLA is in effect then the fault should be notified
			if ($open_alerts[$i][$_fault_sla_in_effect] eq "Y") {
				Notify();
				# Update the version on the queue that notification has
				# occurred, so the event is not notified again and can expire
				$open_alerts[$i][$_fault_notified]="Y";
			}
		}
		$i++;
	}
}

sub SLAinEffect() {
	# Test to see if the SLA of the current event (global @fault) is in
	# effect now. The result, 'Y' or 'N', is stored in the global
	# $SLA_in_Effect.
	#
	# Reads the SLA start hour, end hour and weekend flag of the event
	# through the $_fault_sla_* field indices.
	my ($Current_Hour, $Current_Day_of_Week)=(localtime)[2,6];

	# Set whether it is the weekend or not (0 = Sunday, 6 = Saturday)
	my $Weekend=(($Current_Day_of_Week==0) or ($Current_Day_of_Week==6)) ? 'Y' : 'N';

	my $start_hour=$fault[$_fault_sla_start_hour];
	my $end_hour=$fault[$_fault_sla_end_hour];
	my $weekend_covered=$fault[$_fault_sla_weekend];

	# SLA's have a start hour and an end hour, need to handle cases
	# where start hour < end hour (ie 8 to 17) as well as
	# where start hour >= end hour (ie 22 to 2), which runs past midnight
	my $within_hours;
	if ($start_hour < $end_hour) {
		$within_hours=(($start_hour <= $Current_Hour) and ($Current_Hour < $end_hour));
	} else {
		# Outside the SLA only in the gap between the end and the start hour
		$within_hours=not (($Current_Hour < $start_hour) and ($end_hour <= $Current_Hour));
	}

	if (not $within_hours) {
		$SLA_in_Effect='N';
	} elsif (($Weekend eq 'Y') and ($weekend_covered ne 'Y')) {
		# Inside the service hours, but weekends are not covered
		$SLA_in_Effect='N';
	} else {
		$SLA_in_Effect='Y';
	}
}

sub RemoveFromArray() {
	my $LOCAL_RULE_TRACE=0;
	# Removes from the open event queue (@open_alerts) the first event whose
	# host, application and instance all match the current event (@fault).
	# Entries without an event id are discarded along the way.
	#
	# Note - rule tracing within this event is different, it determined solely
	# by the local rule trace variable and is off by default.
	# This is because the information logged can generate a large number of lines

	#Loop through array
	my $FOUND_FLAG=0;
	my $i=0;
	while ($i <= $#open_alerts) {
		# Sanity check against empty elements
		if ($open_alerts[$i][$_fault_id] eq "") {
			splice @open_alerts, $i, 1;
			if($LOCAL_RULE_TRACE) {
				my $remaining=scalar(@open_alerts);
				Log_Rule_Trace("Removing a blank entry, there are now $remaining in the queue.");
			}
			# The following entry has moved into slot $i, so the index must
			# not be advanced here
			next;
		}
		# All key elements match
		if($LOCAL_RULE_TRACE) {
			Log_Rule_Trace("Checking $fault[$_fault_host] with $open_alerts[$i][$_fault_host].");
		}
		if ($fault[$_fault_host] eq $open_alerts[$i][$_fault_host]) {
			if($LOCAL_RULE_TRACE) {
				Log_Rule_Trace("Checking $fault[$_fault_application] with $open_alerts[$i][$_fault_application].");
			}
			if ($fault[$_fault_application] eq $open_alerts[$i][$_fault_application]) {
				if($LOCAL_RULE_TRACE) {
					Log_Rule_Trace("Checking $fault[$_fault_instance] with $open_alerts[$i][$_fault_instance].");
				}
				if ($fault[$_fault_instance] eq $open_alerts[$i][$_fault_instance]) {
					splice @open_alerts, $i, 1;
					if($LOCAL_RULE_TRACE) {
						my $remaining=scalar(@open_alerts);
						Log_Rule_Trace("Found corresponding event and removed it, there are now $remaining in the queue.");
					}
					$FOUND_FLAG=1;
					last;
				}
			}
		}
		$i++;
	}
	if (not $FOUND_FLAG) {
		if($LOCAL_RULE_TRACE) {
			Log_Rule_Trace("No match found.");
		}
	}
}

sub Log_Fault_Reception() {
	# This logs the current event (global @fault) in the reception log:
	# event id, time of logging, host, application, instance, parameter,
	# message text, severity and event date/time.
	#
	# Database errors are reported with warn and never stop the daemon.
	unless ($dbh_reception) {
		warn "SQL database error: no connection to the reception log database";
		return;
	}
	$dbh_reception->{'RaiseError'} = 1;
	eval {
		# The values are bound to placeholders, so text taken from an event
		# file cannot break or alter the statement
		my($sql) = "Insert into Reception Values (?, ?, ?, ?, ?, ?, ?, ?, ?)";
		$dbh_reception->do($sql, undef,
			$fault[$_fault_id],
			scalar(localtime),
			$fault[$_fault_host],
			$fault[$_fault_application],
			$fault[$_fault_instance],
			$fault[$_fault_parameter],
			$fault[$_fault_msg],
			$fault[$_fault_severity],
			$fault[$_fault_datetime],
		);
	};
	if ($@) { warn "SQL database error: $@"; }
}

sub Log_Rule_Trace {
	# This is the rule tracing output sub.
	# Writes one row to the rule log: the id of the current event (global
	# @fault), the time of logging and the trace text.
	#
	#   $_[0]  - the trace text
	#
	# Database errors are reported with warn and never stop the daemon.
	my($line)= $_[0];

	unless ($dbh_rule_log) {
		warn "SQL database error: no connection to the rule log database";
		return;
	}
	$dbh_rule_log->{'RaiseError'} = 1;
	eval {
		# The values are bound to placeholders, so quotes or commas in the
		# trace text cannot break the statement
		my($sql) = "Insert into Rule_log Values (?, ?, ?)";
		$dbh_rule_log->do($sql, undef, $fault[$_fault_id], scalar(localtime), $line);
	};
	if ($@) { warn "SQL database error: $@"; }
}