#!/usr/local/bin/perl

=head1 NAME

check_drbd - Nagios plugin for DRBD

=head1 SYNOPSIS

B<check_drbd> [B<--verbose> | B<-v>]

=head1 DESCRIPTION

B<check_drbd> is a Nagios plugin for DRBD. It checks the connection state,
resource roles and disk states for every configured DRBD resource, and
produces a WARNING or CRITICAL alert if anything is amiss. The states
of both the local and remote sides of each connection are monitored.

=head2 Nagios status information

The status information emitted by this plugin is similar to the information
in F</proc/drbd>:

    drbd0: Connected Primary/Secondary UpToDate/UpToDate
    |      |         |       |         |        |
    |      |         |       |         |        Remote disk state
    |      |         |       |         Local disk state
    |      |         |       Remote resource role
    |      |         Local resource role
    |      Connection state
    DRBD device

If more than one device is present, and all devices are OK, the output is
summarised:

    drbd0: PriConUpT, drbd1: SecConUpT

If any devices are not OK, the output contains their statuses in full.

=head2 Nagios performance data

Complete performance data is emitted for all configured DRBD resources:

=over

=item drbdI<*>_ns

=item drbdI<*>_nr

The volume of network data sent to and received from the peer, in kiB.

=item drbdI<*>_dw

=item drbdI<*>_dr

The volume of network data written to and read from the local disk, in kiB.

=item drbdI<*>_al

The number of updates of the activity log area of the metadata.

=item drbdI<*>_lo

The number of open requests to the local I/O subsystem issued by DRBD.

=item drbdI<*>_pe

The number of requests sent to the peer that have not yet been answered by the latter.

=item drbdI<*>_ua

The number of requests received by the peer that have not yet been answered by the latter.

=item drbdI<*>_ap

The number of block I/O requests forwarded by DRBD, but not yet answered by DRBD.

=item drbdI<*>_ep

The number of epoch objects.

=item drbdI<*>_oos

The amount of storage currently out-of-sync, in kiB.

=back

=head1 OPTIONS

=over

=item B<-v>, B<--verbose>

Increase the verbosity of the output messages. This disables the Nagios status
information summarisation described above: all resources' statuses are printed
in full.

=back

=head1 EXIT STATUS

=over

=item 0

All resources are OK.

=item 1

Some resources are not OK, but do not need immediate attention.

=item 2

Some resources are not OK and need immediate attention.

=item 3

An error occurred while collecting the resources' statuses.

=back

=head1 FILES

F</proc/drbd>

=head1 SEE ALSO

L<The DRBD Home Page|http://www.drbd.org/>

=cut

use strict;
use warnings;

# Name under which the plugin was invoked: the part of $0 after the last
# slash. Used in the help and usage texts. Falls back to 'check_drbd' when
# the match fails (for instance when $0 contains no slash).
use constant BASENAME => ($0 =~ m{.*/([^/]+)})[0] || 'check_drbd';

# File from which the DRBD state is read.
use constant STATE_FILE => '/proc/drbd';

# Plugin exit statuses; see EXIT STATUS in the POD above. The numeric values
# are also compared with each other to find the worst level among devices.
use constant {
	OK       => 0,
	WARNING  => 1,
	CRITICAL => 2,
	UNKNOWN  => 3,
};

use Getopt::Long;
use IO::File;

# Forward declarations: they let the subroutines below be called as list
# operators (without parentheses) before their definitions appear.

# Help and usage texts.
sub help;
sub usage;

# Performance data collection and the exit helpers, one per status.
sub perfdata;
sub ok;
sub warning;
sub critical;
sub unknown;

# Reader for STATE_FILE.
sub get_state;

# Turn uncaught exceptions into a plugin result: print the message on
# standard output and exit with the UNKNOWN status. Exceptions raised while
# an eval is active are re-thrown unchanged so the eval can catch them.
$SIG{__DIE__} = sub {
	my @message = @_;

	if ($^S) {
		die @message;
	}

	print @message;
	exit UNKNOWN;
};

# Verbosity level: incremented once for every -v / --verbose on the
# command line; undefined when the option was not given.
my $verbose;

Getopt::Long::Configure(qw( bundling no_ignore_case ));

# Anything other than a clean parse with no leftover arguments is a usage
# error: show the usage line and exit with the UNKNOWN status.
unless (
	GetOptions(
		'verbose|v+' => \$verbose,
		'help|?'     => sub { help(); exit 0 },
		'usage'      => sub { usage(); exit 0 },
	)
	and @ARGV == 0
) {
	usage();
	exit UNKNOWN;
}

# Read the state of every DRBD device. $status tracks the worst level seen
# across all of them.
my @state  = get_state();
my $status = OK;

# Header line emitted ahead of the status output.
print "<<<drbd>>>\n";

# Level of each known connection state. Assume CRITICAL by default: any
# state that is not listed here is rated CRITICAL.
my %cs_severity = (
	Connected     => OK,
	Unconfigured  => OK,
	StandAlone    => WARNING,
	SyncingAll    => WARNING,
	SyncingQuick  => WARNING,
	SyncSource    => WARNING,
	SyncTarget    => WARNING,
	VerifyS       => WARNING,
	VerifyT       => WARNING,
	Disconnecting => WARNING,
	TearDown      => WARNING,
	StartingSyncS => WARNING,
	StartingSyncT => WARNING,
	WFSyncUUID    => WARNING,
);

# Connection states in which a non-zero out-of-sync count is rated OK;
# in any other connection state it is rated CRITICAL.
my %oos_severity = (
	StartingSyncS => OK,
	StartingSyncT => OK,
	SyncSource    => OK,
	SyncTarget    => OK,
	PausedSyncS   => OK,
	PausedSyncT   => OK,
);

# Levels of the resource roles (local and peer); unlisted roles are CRITICAL.
my %role_severity = (
	Primary   => OK,
	Secondary => OK,
);

# Levels of the disk states (local and peer); unlisted states are CRITICAL.
my %disk_severity = (
	UpToDate    => OK,
	Consistent  => OK,
	Negotiating => WARNING,
	Attaching   => WARNING,
);

DEVICE:
foreach my $minor (0 .. $#state) {
	# Slots without a device are skipped.
	my $dev = $state[$minor];
	next DEVICE unless $dev;

	# Connection state.
	my $cs_level = $cs_severity{ $dev->{cs} };
	$dev->{cs_level} = defined $cs_level ? $cs_level : CRITICAL;

	# Out-of-sync data is only rated when there is any, and the rating
	# depends on the connection state.
	if ($dev->{oos}) {
		my $oos_level = $oos_severity{ $dev->{cs} };
		$dev->{oos_level} = defined $oos_level ? $oos_level : CRITICAL;
	}

	# Local and peer resource roles.
	foreach my $field (qw( ro pro )) {
		my $role_level = $role_severity{ $dev->{$field} };
		$dev->{"${field}_level"} = defined $role_level ? $role_level : CRITICAL;
	}

	# Local and peer disk states.
	foreach my $field (qw( ds pds )) {
		my $disk_level = $disk_severity{ $dev->{$field} };
		$dev->{"${field}_level"} = defined $disk_level ? $disk_level : CRITICAL;
	}

	# Additional remarks appended to the device's status text.
	my @notes;
	push @notes, sprintf('%d kiB out-of-sync', $dev->{oos})
		if $dev->{oos};
	unless ($dev->{iof} =~ /^r.--(.(-)?)?$/) {
		# I/O flags that do not fit the accepted pattern are CRITICAL.
		$dev->{iof_level} = CRITICAL;
		push @notes, sprintf('I/O flags: %s', $dev->{iof});
	}
	my $extra = @notes ? ' (' . join(', ', @notes) . ')' : '';

	# The device's level is the worst of all its "*_level" entries, and
	# the overall status is the worst level of any device.
	my $worst = OK;
	foreach my $key (grep { /_level$/ } keys %$dev) {
		$worst = $dev->{$key} if $dev->{$key} > $worst;
	}
	$status = $worst if $worst > $status;

	$dev->{level} = $worst;
	$dev->{info}  = sprintf 'drbd%d:cs=%s|ro=%s|pro=%s|ds=%s|pds=%s|extra=%s',
		$minor, @{$dev}{qw( cs ro pro ds pds )}, $extra;
	# Role and connstate reversed, like old check_drbd
	$dev->{short} = sprintf 'drbd%d: %0.3s%0.3s%0.3s%s',
		$minor, @{$dev}{qw( ro cs ds )}, $extra;

	# Queue performance data for every counter that was present in the
	# state file, in this fixed order.
	foreach my $counter (qw( ns nr dw dr al bm lo pe ua ap oos )) {
		my $value = $dev->{$counter};
		next unless defined $value;
		perfdata("${counter}=${value}");
	}
}

# No device was found in the state file at all.
critical('No DRBD volumes present') unless @state;

if ($status == OK) {
	# Everything is fine: print the full status when verbose or when there
	# is only one device slot, the abbreviated status otherwise.
	my $full    = $verbose || @state == 1;
	my @summary = map { $full ? $_->{info} : $_->{short} } grep { defined } @state;
	ok(join ', ', @summary);
}
else {
	# Something is amiss: print the full status of every device that is
	# not OK, and exit with the overall (worst) status.
	my @problems = map { $_->{info} } grep { defined $_ and $_->{level} } @state;
	my $report   = join ', ', @problems;
	if ($status == WARNING) {
		warning($report);
	}
	else {
		critical($report);
	}
}

# Not reached, because ok, warning and critical all exit. Should control
# ever get here, the __DIE__ handler reports the failure as UNKNOWN.
die;

###########################################################################

# Print the full help text (usage line plus the list of options) on
# standard output.
sub help {
	my $program = BASENAME;

	print <<"END_HELP";
Usage: ${program} [OPTION...]
Check DRBD resources.

 Plugin options:
  -v, --verbose              Increase verbosity

 Help options:
  -?, --help                 Give this help list
      --usage                Give a short usage message
END_HELP
}

# Print the one-line usage summary on standard output.
sub usage {
	my $program = BASENAME;

	print <<"END_USAGE";
Usage: ${program} [-v?] [--verbose] [--help] [--usage]
END_USAGE
}

###########################################################################

{
	# Performance data strings queued by perfdata() until _exit() prints them.
	my @perfdata;

	# Queue one or more performance data strings for output at exit.
	sub perfdata { push @perfdata, @_ }

	# Print the final output line and terminate the plugin.
	#
	#   $status   exit status of the plugin
	#   $message  status text; when undefined, the name of the status is
	#             printed instead ('UNKNOWN' for anything but 0, 1 or 2)
	#
	# The queued performance data follows the message on the same line:
	# every item is preceded by '|', and when there is more than one item
	# the line also ends in '|'.
	sub _exit {
		my ($status, $message) = @_;

		my $text = defined $message
			? $message
			: (qw( OK WARNING CRITICAL )[$status] || 'UNKNOWN');

		# The first item is taken off the queue even if it turns out to be
		# false and is therefore not printed.
		my $first = shift @perfdata;
		my $perf  = '';
		$perf .= "|$first" if $first;
		$perf .= '|' . join('', map { "$_|" } @perfdata) if @perfdata;

		print join '', $text, $perf, "\n";
		exit $status;
	}
}

# Exit helpers, one per plugin status. Each hands its arguments (the
# optional status message) to _exit together with the matching status
# constant; _exit prints the output line and terminates the plugin.
sub ok {
	return _exit(OK, @_);
}

sub warning {
	return _exit(WARNING, @_);
}

sub critical {
	return _exit(CRITICAL, @_);
}

sub unknown {
	return _exit(UNKNOWN, @_);
}

###########################################################################

# Parse STATE_FILE and return the list of DRBD devices found in it.
#
# The returned list is indexed by device number. Each element is either
# undefined (no full status line was seen for that number) or a hash
# reference with these entries:
#
#   cs       connection state
#   ro, pro  local and peer resource role
#   ds, pds  local and peer disk state
#   iof      I/O flags
#
# plus one entry for every "key:value" pair found on the lines that follow
# the device's status line (ns, nr, dw, dr, al, bm, lo, pe, ua, ap, ep, wo,
# oos, ...).
#
# Exits with the CRITICAL status (via critical) if the file cannot be opened.
sub get_state {
	my $io = IO::File->new(STATE_FILE, 'r')
		or critical("Could not open @{[STATE_FILE]} for reading: $!");

# 0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r----
#    ns:0 nr:20492 dw:20480 dr:124 al:5 bm:1296 lo:0 pe:0 ua:0 ap:0 ep:1 wo:d oos:0

	my @state;
	my $device;
	while (my $line = <$io>) {
		# Full status line of a device: start a new record. The field
		# between the disk states and the I/O flags is skipped.
		if ($line =~ m(^ \s* (\d+): \s* cs:(\w+) \s+ (?:ro|st):(\w+)/(\w+) \s+ ds:(\w+)/(\w+) \s+ \S+ \s+ (\S+))x) {
			$device = $state[$1] = {
				cs  => $2,
				ro  => $3,
				pro => $4,
				ds  => $5,
				pds => $6,
				iof => $7,
			};
			next;
		}

		# Any other line that opens a device section (for example
		# " 1: cs:Unconfigured") belongs to a device that is not recorded.
		# Close the previous device's section, so that the pairs on this
		# line are not written into the previous device's record.
		if ($line =~ /^\s*\d+:/) {
			undef $device;
			next;
		}

		# Lines before the first device (version banner etc.) are ignored.
		$device or next;

		# Counter line: store every key:value pair in the current device.
		$device->{$1} = $2 while $line =~ /(\w+):(\S+)/g;
	}

	$io->close;

	return @state;
}
