You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

238 lines
8.5 KiB

From c670ebe177f0b67da2804f7b3862b4c141a032fa Mon Sep 17 00:00:00 2001
From: "Thierno IB. BARRY" <ibrahima.br@gmail.com>
Date: Tue, 26 Apr 2016 20:21:24 +0200
Subject: [PATCH 03/10] Improve the cluster check script to make it more Nagios
friendly (#69)
- Use request function as other scripts
- Use check_threshold instead of manual check (close #67)
- Add an expected node list in the cluster (close #68)
---
scripts/check_rabbitmq_cluster | 138 +++++++++++++++++++++++++--------
t/1_checks_scripts.t | 4 +-
2 files changed, 107 insertions(+), 35 deletions(-)
diff --git a/scripts/check_rabbitmq_cluster b/scripts/check_rabbitmq_cluster
index 534f01f..f4da8cb 100755
--- a/scripts/check_rabbitmq_cluster
+++ b/scripts/check_rabbitmq_cluster
@@ -7,7 +7,8 @@
use strict;
use warnings;
-use Monitoring::Plugin;
+use Monitoring::Plugin qw(OK CRITICAL WARNING UNKNOWN);
+use Monitoring::Plugin::Functions qw(%STATUS_TEXT);
use LWP::UserAgent;
use URI::Escape;
use JSON;
@@ -73,18 +74,23 @@ $p->add_arg(spec => 'proxyurl=s',
help => "Use proxy url like http://proxy.domain.com:8080",
);
+$p->add_arg(spec => 'nodes|n=s',
+ help => "Comma separated list of expected nodes in the cluster",
+);
+
$p->add_arg(
- spec => 'warning|w=i',
+ spec => 'warning|w=s',
help =>
-qq{-w, --warning=THRESHOLD
+qq{-w, --warning=THRESHOLD[,THRESHOLD[,THRESHOLD]]
Warning thresholds specified in order that the metrics are returned.
Specify -1 if no warning threshold.},
+
);
$p->add_arg(
- spec => 'critical|c=i',
+ spec => 'critical|c=s',
help =>
-qq{-c, --critical=THRESHOLD
+qq{-c, --critical=THRESHOLD[,THRESHOLD[,THRESHOLD]]
Critical thresholds specified in order that the metrics are returned.
Specify -1 if no critical threshold.},
);
@@ -92,11 +98,34 @@ qq{-c, --critical=THRESHOLD
# Parse arguments and process standard ones (e.g. usage, help, version)
$p->getopts;
+# Sanity-check --warning/--critical: each accepts 1 to 3 comma separated ranges.
+my %warning;
+if (defined $p->opts->warning) {
+ my @warning = split(',', $p->opts->warning);
+ $p->nagios_die("You should specify 1 to 3 ranges for --warning argument") unless $#warning < 3;
+ # Ranges are positional: all nodes, disc nodes, ram nodes; omitted ones stay undef.
+ $warning{'nb_running_node'} = shift @warning;
+ $warning{'nb_running_disc_node'} = shift @warning;
+ $warning{'nb_running_ram_node'} = shift @warning;
+}
+# Same parsing for --critical.
+my %critical;
+if (defined $p->opts->critical) {
+ my @critical = split(',', $p->opts->critical);
+ $p->nagios_die("You should specify 1 to 3 ranges for --critical argument") unless $#critical < 3;
+ # Positional, in the same order as --warning.
+ $critical{'nb_running_node'} = shift @critical;
+ $critical{'nb_running_disc_node'} = shift @critical;
+ $critical{'nb_running_ram_node'} = shift @critical;
+}
+
+# check stuff.
my $hostname=$p->opts->hostname;
my $port=$p->opts->port;
my $url = sprintf("http%s://%s:%d/api/nodes", ($p->opts->ssl ? "s" : ""), $hostname, $port);
my $ua = LWP::UserAgent->new;
+
if (defined $p->opts->proxyurl)
{
$ua->proxy('http', $p->opts->proxyurl);
@@ -110,40 +139,85 @@ $ua->timeout($p->opts->timeout);
if ($p->opts->ssl and $ua->can('ssl_opts')) {
$ua->ssl_opts(verify_hostname => $p->opts->ssl_strict);
}
-my $req = HTTP::Request->new(GET => $url);
-$req->authorization_basic($p->opts->username, $p->opts->password);
-my $res = $ua->request($req);
-
-if (!$res->is_success) {
- # Deal with standard error conditions - make the messages more sensible
- if ($res->code == 400) {
- my $bodyref = decode_json $res->content;
- $p->nagios_exit(CRITICAL, $bodyref->{'reason'});
- }
- $res->code == 404 and $p->nagios_die("Not found");
- $res->code == 401 and $p->nagios_die("Access refused");
- if ($res->code < 200 or $res->code > 400 ) {
- $p->nagios_exit(CRITICAL, "Received ".$res->status_line);
- }
+
+my ($retcode, $result) = request($url);
+if ($retcode != 200) {
+ $p->nagios_exit(CRITICAL, "$result : $url");
}
-my @nodes = @{ decode_json $res->content };
-my $count = 0;
-foreach my $node ( @nodes ) {
- if ($node->{"name"} && $node->{"running"}) {
- $count++;
- }
+my $values = {};
+$values->{'running_nodes'} = [];
+$values->{'nb_running_node'} = 0;
+$values->{'nb_running_disc_node'} = 0;
+$values->{'nb_running_ram_node'} = 0;
+# Count the running nodes (in total and per type) and remember their names for the --nodes check.
+foreach my $node ( @$result ) {
+ if ($node->{"name"} && $node->{"running"}) {
+ push @{ $values->{'running_nodes'} }, $node->{"name"};
+
+ $values->{'nb_running_node'}++;
+ $values->{'nb_running_disc_node'}++ if ($node->{"type"} && $node->{"type"} eq "disc");
+ $values->{'nb_running_ram_node'}++ if ($node->{"type"} && $node->{"type"} eq "ram");
+ }
+}
-if ($p->opts->critical && $count <= $p->opts->critical) {
- $p->nagios_exit( CRITICAL, "The cluster has $count nodes" );
+my $code = 0;
+my $message = "";
+# With --nodes, compare the expected node list against the running nodes; otherwise apply the thresholds.
+if (defined($p->opts->nodes)) {
+ my @nodes = split(',', $p->opts->nodes);
+ my @excluded_nodes = diff(\@nodes, \@{ $values->{'running_nodes'} });
+ my $nb_excluded_nodes = @excluded_nodes;
+ ($code, $message) = (OK, "All nodes are running");
+ ($code, $message) = (CRITICAL, "$nb_excluded_nodes failed cluster node: " . join(',', @excluded_nodes)) if($nb_excluded_nodes != 0);
+}
+else {
+ my @metrics = ("nb_running_node", "nb_running_disc_node", "nb_running_ram_node");
+ for my $metric (@metrics) {
+ my $warning = undef;
+ $warning = $warning{$metric} if (defined $warning{$metric} and $warning{$metric} ne -1);
+ # A threshold given as -1 means "no threshold" for that metric, so it is left undef.
+ my $critical = undef;
+ $critical = $critical{$metric} if (defined $critical{$metric} and $critical{$metric} ne -1);
+ # A metric missing from $values is checked as 0.
+ my $value = 0;
+ $value = $values->{$metric} if defined $values->{$metric};
+ my $metric_code = $p->check_threshold(check => $value, warning => $warning, critical=> $critical);
+ $p->add_message($metric_code, sprintf("%s %s (%d)", $metric, $STATUS_TEXT{$metric_code}, $value));
+ }
+ ($code, $message) = $p->check_messages(join_all=>', ');
+}
-if ($p->opts->warning && $count <= $p->opts->warning) {
- $p->nagios_exit( WARNING, "The cluster has $count nodes" );
+$p->nagios_exit(return_code => $code, message => $message);
+
+sub request {
+ # GET $url with HTTP basic auth. Returns (code, decoded JSON body) on success, (code, error text) otherwise.
+ my ($url) = @_;
+ my $req = HTTP::Request->new(GET => $url);
+ $req->authorization_basic($p->opts->username, $p->opts->password);
+ my $res = $ua->request($req);
+ my $bodyref = eval { decode_json $res->content }; # undef when the body is not valid JSON
+ if (!$res->is_success) {
+ # Deal with standard error conditions - make the messages more sensible
+ if ($res->code == 400) {
+ my $reason = ref($bodyref) eq 'HASH' ? $bodyref->{'reason'} : undef;
+ return (400, defined $reason ? $reason : "Received ".$res->status_line);
+ }
+ $res->code == 404 and return (404, "Not Found");
+ $res->code == 401 and return (401, "Access Refused");
+ $res->status_line =~ /Can\'t connect/ and return (500, "Connection Refused : $url");
+ # Every other failure (3xx included) is reported rather than falling through to the success path.
+ return ($res->code, "Received ".$res->status_line);
+ }
+ # A successful answer without a valid JSON body is reported as 500 so the caller does not treat it as data.
+ defined $bodyref or return (500, "Invalid JSON received");
+ return($res->code, $bodyref);
+}
-$p->nagios_exit( OK, "The cluster has $count nodes" );
+sub diff { # Takes two array refs; returns the elements of the first that are absent from the second (string comparison).
+ my ($array_1, $array_2) = (@_);
+ return grep { my $baz = $_; !grep($_ eq $baz, @$array_2) } @$array_1;
+}
=head1 NAME
@@ -206,7 +280,7 @@ is needed is to specify the host to connect to:
This returns a standard Nagios result:
- RABBITMQ_NODE OK - The cluster has 3 nodes
+ RABBITMQ_CLUSTER OK - The cluster has 3 nodes
=head1 ERRORS
diff --git a/t/1_checks_scripts.t b/t/1_checks_scripts.t
index ad9a4b7..7672f84 100644
--- a/t/1_checks_scripts.t
+++ b/t/1_checks_scripts.t
@@ -13,9 +13,7 @@ $args = sprintf("--hostname=%s --username=%s --password=%s", $rabbit_hostname, $
script_runs(['scripts/check_rabbitmq_aliveness', $args]);
# Checks on check_rabbitmq_cluster
-$regex = /The cluster has \d+ nodes/im;
script_runs(['scripts/check_rabbitmq_cluster', ($args, ' -w 1 -c 1')]);
-script_stdout_like $regex, 'scripts/check_rabbitmq_cluster stdout is correct';
# Checks on check_rabbitmq_connections
script_runs(['scripts/check_rabbitmq_connections', $args]);
@@ -35,7 +33,7 @@ script_runs(['scripts/check_rabbitmq_queue', ($args, '--queue=aliveness-test')])
# Checks on check_rabbitmq_server
$regex = /(Memory=.*)\s(Process=.*)\s(FD=.*)/im;
script_runs(['scripts/check_rabbitmq_server', ($args, "--node=${rabbit_servername}")]);
-script_stdout_like $regex, 'scripts/check_rabbitmq_server stdout is correct';
+script_stdout_like $regex, 'Scripts/check_rabbitmq_server stdout is correct';
# Checks on check_rabbitmq_watermark
script_runs(['scripts/check_rabbitmq_watermark', ($args, "--node=${rabbit_servername}")]);
--
2.20.1