You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
239 lines
8.5 KiB
239 lines
8.5 KiB
3 years ago
|
From c670ebe177f0b67da2804f7b3862b4c141a032fa Mon Sep 17 00:00:00 2001
|
||
|
From: "Thierno IB. BARRY" <ibrahima.br@gmail.com>
|
||
|
Date: Tue, 26 Apr 2016 20:21:24 +0200
|
||
|
Subject: [PATCH 03/10] Improve the cluster check script to make it more Nagios
|
||
|
friendly (#69)
|
||
|
|
||
|
- Use request function as other scripts
|
||
|
- Use check_threshold instead of manual check (close #67)
|
||
|
- Add an expected node list in the cluster (close #68)
|
||
|
---
|
||
|
scripts/check_rabbitmq_cluster | 138 +++++++++++++++++++++++++--------
|
||
|
t/1_checks_scripts.t | 4 +-
|
||
|
2 files changed, 107 insertions(+), 35 deletions(-)
|
||
|
|
||
|
diff --git a/scripts/check_rabbitmq_cluster b/scripts/check_rabbitmq_cluster
|
||
|
index 534f01f..f4da8cb 100755
|
||
|
--- a/scripts/check_rabbitmq_cluster
|
||
|
+++ b/scripts/check_rabbitmq_cluster
|
||
|
@@ -7,7 +7,8 @@
|
||
|
use strict;
|
||
|
use warnings;
|
||
|
|
||
|
-use Monitoring::Plugin;
|
||
|
+use Monitoring::Plugin qw(OK CRITICAL WARNING UNKNOWN);
|
||
|
+use Monitoring::Plugin::Functions qw(%STATUS_TEXT);
|
||
|
use LWP::UserAgent;
|
||
|
use URI::Escape;
|
||
|
use JSON;
|
||
|
@@ -73,18 +74,23 @@ $p->add_arg(spec => 'proxyurl=s',
|
||
|
help => "Use proxy url like http://proxy.domain.com:8080",
|
||
|
);
|
||
|
|
||
|
+$p->add_arg(spec => 'nodes|n=s',
|
||
|
+ help => "Comma separated list of expected nodes in the cluster",
|
||
|
+);
|
||
|
+
|
||
|
$p->add_arg(
|
||
|
- spec => 'warning|w=i',
|
||
|
+ spec => 'warning|w=s',
|
||
|
help =>
|
||
|
-qq{-w, --warning=THRESHOLD
|
||
|
+qq{-w, --warning=THRESHOLD[,THRESHOLD[,THRESHOLD]]
|
||
|
Warning thresholds specified in order that the metrics are returned.
|
||
|
Specify -1 if no warning threshold.},
|
||
|
+
|
||
|
);
|
||
|
|
||
|
$p->add_arg(
|
||
|
- spec => 'critical|c=i',
|
||
|
+ spec => 'critical|c=s',
|
||
|
help =>
|
||
|
-qq{-c, --critical=THRESHOLD
|
||
|
+qq{-c, --critical=THRESHOLD[,THRESHOLD[,THRESHOLD]]
|
||
|
Critical thresholds specified in order that the metrics are returned.
|
||
|
Specify -1 if no critical threshold.},
|
||
|
);
|
||
|
@@ -92,11 +98,34 @@ qq{-c, --critical=THRESHOLD
|
||
|
# Parse arguments and process standard ones (e.g. usage, help, version)
|
||
|
$p->getopts;
|
||
|
|
||
|
+# perform sanity checking on command line options
|
||
|
+my %warning;
|
||
|
+if (defined $p->opts->warning) {
|
||
|
+ my @warning = split(',', $p->opts->warning);
|
||
|
+ $p->nagios_die("You should specify 1 to 3 ranges for --warning argument") unless $#warning < 3;
|
||
|
+
|
||
|
+ $warning{'nb_running_node'} = shift @warning;
|
||
|
+ $warning{'nb_running_disc_node'} = shift @warning;
|
||
|
+ $warning{'nb_running_ram_node'} = shift @warning;
|
||
|
+}
|
||
|
+
|
||
|
+my %critical;
|
||
|
+if (defined $p->opts->critical) {
|
||
|
+ my @critical = split(',', $p->opts->critical);
|
||
|
+ $p->nagios_die("You should specify specify 1 to 3 ranges for --critical argument") unless $#critical < 3;
|
||
|
+
|
||
|
+ $critical{'nb_running_node'} = shift @critical;
|
||
|
+ $critical{'nb_running_disc_node'} = shift @critical;
|
||
|
+ $critical{'nb_running_ram_node'} = shift @critical;
|
||
|
+}
|
||
|
+
|
||
|
+# check stuff.
|
||
|
my $hostname=$p->opts->hostname;
|
||
|
my $port=$p->opts->port;
|
||
|
|
||
|
my $url = sprintf("http%s://%s:%d/api/nodes", ($p->opts->ssl ? "s" : ""), $hostname, $port);
|
||
|
my $ua = LWP::UserAgent->new;
|
||
|
+
|
||
|
if (defined $p->opts->proxyurl)
|
||
|
{
|
||
|
$ua->proxy('http', $p->opts->proxyurl);
|
||
|
@@ -110,40 +139,85 @@ $ua->timeout($p->opts->timeout);
|
||
|
if ($p->opts->ssl and $ua->can('ssl_opts')) {
|
||
|
$ua->ssl_opts(verify_hostname => $p->opts->ssl_strict);
|
||
|
}
|
||
|
-my $req = HTTP::Request->new(GET => $url);
|
||
|
-$req->authorization_basic($p->opts->username, $p->opts->password);
|
||
|
-my $res = $ua->request($req);
|
||
|
-
|
||
|
-if (!$res->is_success) {
|
||
|
- # Deal with standard error conditions - make the messages more sensible
|
||
|
- if ($res->code == 400) {
|
||
|
- my $bodyref = decode_json $res->content;
|
||
|
- $p->nagios_exit(CRITICAL, $bodyref->{'reason'});
|
||
|
- }
|
||
|
- $res->code == 404 and $p->nagios_die("Not found");
|
||
|
- $res->code == 401 and $p->nagios_die("Access refused");
|
||
|
- if ($res->code < 200 or $res->code > 400 ) {
|
||
|
- $p->nagios_exit(CRITICAL, "Received ".$res->status_line);
|
||
|
- }
|
||
|
+
|
||
|
+my ($retcode, $result) = request($url);
|
||
|
+if ($retcode != 200) {
|
||
|
+ $p->nagios_exit(CRITICAL, "$result : $url");
|
||
|
}
|
||
|
|
||
|
-my @nodes = @{ decode_json $res->content };
|
||
|
-my $count = 0;
|
||
|
-foreach my $node ( @nodes ) {
|
||
|
- if ($node->{"name"} && $node->{"running"}) {
|
||
|
- $count++;
|
||
|
- }
|
||
|
+my $values = {};
|
||
|
+$values->{'running_nodes'} = ();
|
||
|
+$values->{'nb_running_node'} = 0;
|
||
|
+$values->{'nb_running_disc_node'} = 0;
|
||
|
+$values->{'nb_running_ram_node'} = 0;
|
||
|
+
|
||
|
+foreach my $node ( @$result ) {
|
||
|
+ if ($node->{"name"} && $node->{"running"}) {
|
||
|
+ push @{ $values->{'running_nodes'} }, $node->{"name"};
|
||
|
+
|
||
|
+ $values->{'nb_running_node'}++;
|
||
|
+ $values->{'nb_running_disc_node'}++ if ($node->{"type"} && $node->{"type"} eq "disc");
|
||
|
+ $values->{'nb_running_ram_node'}++ if ($node->{"type"} && $node->{"type"} eq "ram");
|
||
|
+ }
|
||
|
}
|
||
|
|
||
|
-if ($p->opts->critical && $count <= $p->opts->critical) {
|
||
|
- $p->nagios_exit( CRITICAL, "The cluster has $count nodes" );
|
||
|
+my $code = 0;
|
||
|
+my $message = "";
|
||
|
+
|
||
|
+if (defined($p->opts->nodes)) {
|
||
|
+ my @nodes = split(',', $p->opts->nodes);
|
||
|
+ my @excluded_nodes = diff(\@nodes, \@{ $values->{'running_nodes'} });
|
||
|
+ my $nb_excluded_nodes = @excluded_nodes;
|
||
|
+ ($code, $message) = (OK, "All nodes are running");
|
||
|
+ ($code, $message) = (CRITICAL, "$nb_excluded_nodes failed cluster node: " . join(',', @excluded_nodes)) if($nb_excluded_nodes ne 0);
|
||
|
+}
|
||
|
+else {
|
||
|
+ my @metrics = ("nb_running_node", "nb_running_disc_node", "nb_running_ram_node");
|
||
|
+ for my $metric (@metrics) {
|
||
|
+ my $warning = undef;
|
||
|
+ $warning = $warning{$metric} if (defined $warning{$metric} and $warning{$metric} ne -1);
|
||
|
+
|
||
|
+ my $critical = undef;
|
||
|
+ $critical = $critical{$metric} if (defined $critical{$metric} and $critical{$metric} ne -1);
|
||
|
+
|
||
|
+ my $value = 0;
|
||
|
+ $value = $values->{$metric} if defined $values->{$metric};
|
||
|
+ my $code = $p->check_threshold(check => $value, warning => $warning, critical=> $critical);
|
||
|
+ $p->add_message($code, sprintf("$metric ".$STATUS_TEXT{$code}." (%d)", $value));
|
||
|
+ }
|
||
|
+ ($code, $message) = $p->check_messages(join_all=>', ');
|
||
|
}
|
||
|
|
||
|
-if ($p->opts->warning && $count <= $p->opts->warning) {
|
||
|
- $p->nagios_exit( WARNING, "The cluster has $count nodes" );
|
||
|
+$p->nagios_exit(return_code => $code, message => $message);
|
||
|
+
|
||
|
+sub request {
|
||
|
+ my ($url) = @_;
|
||
|
+ my $req = HTTP::Request->new(GET => $url);
|
||
|
+ $req->authorization_basic($p->opts->username, $p->opts->password);
|
||
|
+ my $res = $ua->request($req);
|
||
|
+
|
||
|
+ if (!$res->is_success) {
|
||
|
+ # Deal with standard error conditions - make the messages more sensible
|
||
|
+ if ($res->code == 400) {
|
||
|
+ my $bodyref = decode_json $res->content;
|
||
|
+ return (400, $bodyref->{'reason'});
|
||
|
+
|
||
|
+ }
|
||
|
+ $res->code == 404 and return (404, "Not Found");
|
||
|
+ $res->code == 401 and return (401, "Access Refused");
|
||
|
+ $res->status_line =~ /Can\'t connect/ and return (500, "Connection Refused : $url");
|
||
|
+ if ($res->code < 200 or $res->code > 400 ) {
|
||
|
+ return ($res->code, "Received ".$res->status_line);
|
||
|
+ }
|
||
|
+ }
|
||
|
+ my $bodyref = decode_json $res->content;
|
||
|
+ return($res->code, $bodyref);
|
||
|
}
|
||
|
|
||
|
-$p->nagios_exit( OK, "The cluster has $count nodes" );
|
||
|
+sub diff {
|
||
|
+ my ($array_1, $array_2) = (@_);
|
||
|
+ return grep { my $baz = $_; grep($_ ne $baz, @$array_2) } @$array_1;
|
||
|
+}
|
||
|
|
||
|
=head1 NAME
|
||
|
|
||
|
@@ -206,7 +280,7 @@ is needed is to specify the host to connect to:
|
||
|
|
||
|
This returns a standard Nagios result:
|
||
|
|
||
|
- RABBITMQ_NODE OK - The cluster has 3 nodes
|
||
|
+ RABBITMQ_CLUSTER OK - The cluster has 3 nodes
|
||
|
|
||
|
=head1 ERRORS
|
||
|
|
||
|
diff --git a/t/1_checks_scripts.t b/t/1_checks_scripts.t
|
||
|
index ad9a4b7..7672f84 100644
|
||
|
--- a/t/1_checks_scripts.t
|
||
|
+++ b/t/1_checks_scripts.t
|
||
|
@@ -13,9 +13,7 @@ $args = sprintf("--hostname=%s --username=%s --password=%s", $rabbit_hostname, $
|
||
|
script_runs(['scripts/check_rabbitmq_aliveness', $args]);
|
||
|
|
||
|
# Checks on check_rabbitmq_cluster
|
||
|
-$regex = /The cluster has \d+ nodes/im;
|
||
|
script_runs(['scripts/check_rabbitmq_cluster', ($args, ' -w 1 -c 1')]);
|
||
|
-script_stdout_like $regex, 'scripts/check_rabbitmq_cluster stdout is correct';
|
||
|
|
||
|
# Checks on check_rabbitmq_connections
|
||
|
script_runs(['scripts/check_rabbitmq_connections', $args]);
|
||
|
@@ -35,7 +33,7 @@ script_runs(['scripts/check_rabbitmq_queue', ($args, '--queue=aliveness-test')])
|
||
|
# Checks on check_rabbitmq_server
|
||
|
$regex = /(Memory=.*)\s(Process=.*)\s(FD=.*)/im;
|
||
|
script_runs(['scripts/check_rabbitmq_server', ($args, "--node=${rabbit_servername}")]);
|
||
|
-script_stdout_like $regex, 'scripts/check_rabbitmq_server stdout is correct';
|
||
|
+script_stdout_like $regex, 'Scripts/check_rabbitmq_server stdout is correct';
|
||
|
|
||
|
# Checks on check_rabbitmq_watermark
|
||
|
script_runs(['scripts/check_rabbitmq_watermark', ($args, "--node=${rabbit_servername}")]);
|
||
|
--
|
||
|
2.20.1
|
||
|
|