You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
238 lines
8.5 KiB
238 lines
8.5 KiB
From c670ebe177f0b67da2804f7b3862b4c141a032fa Mon Sep 17 00:00:00 2001 |
|
From: "Thierno IB. BARRY" <ibrahima.br@gmail.com> |
|
Date: Tue, 26 Apr 2016 20:21:24 +0200 |
|
Subject: [PATCH 03/10] Improve the cluster check script to make it more Nagios |
|
friendly (#69) |
|
|
|
- Use request function as other scripts |
|
- Use check_threshold instead of manual check (close #67) |
|
- Add an expected node list in the cluster (close #68) |
|
--- |
|
scripts/check_rabbitmq_cluster | 138 +++++++++++++++++++++++++-------- |
|
t/1_checks_scripts.t | 4 +- |
|
2 files changed, 107 insertions(+), 35 deletions(-) |
|
|
|
diff --git a/scripts/check_rabbitmq_cluster b/scripts/check_rabbitmq_cluster |
|
index 534f01f..f4da8cb 100755 |
|
--- a/scripts/check_rabbitmq_cluster |
|
+++ b/scripts/check_rabbitmq_cluster |
|
@@ -7,7 +7,8 @@ |
|
use strict; |
|
use warnings; |
|
|
|
-use Monitoring::Plugin; |
|
+use Monitoring::Plugin qw(OK CRITICAL WARNING UNKNOWN); |
|
+use Monitoring::Plugin::Functions qw(%STATUS_TEXT); |
|
use LWP::UserAgent; |
|
use URI::Escape; |
|
use JSON; |
|
@@ -73,18 +74,23 @@ $p->add_arg(spec => 'proxyurl=s', |
|
help => "Use proxy url like http://proxy.domain.com:8080", |
|
); |
|
|
|
+$p->add_arg(spec => 'nodes|n=s', |
|
+ help => "Comma separated list of expected nodes in the cluster", |
|
+); |
|
+ |
|
$p->add_arg( |
|
- spec => 'warning|w=i', |
|
+ spec => 'warning|w=s', |
|
help => |
|
-qq{-w, --warning=THRESHOLD |
|
+qq{-w, --warning=THRESHOLD[,THRESHOLD[,THRESHOLD]] |
|
Warning thresholds specified in order that the metrics are returned. |
|
Specify -1 if no warning threshold.}, |
|
+ |
|
); |
|
|
|
$p->add_arg( |
|
- spec => 'critical|c=i', |
|
+ spec => 'critical|c=s', |
|
help => |
|
-qq{-c, --critical=THRESHOLD |
|
+qq{-c, --critical=THRESHOLD[,THRESHOLD[,THRESHOLD]] |
|
Critical thresholds specified in order that the metrics are returned. |
|
Specify -1 if no critical threshold.}, |
|
); |
|
@@ -92,11 +98,34 @@ qq{-c, --critical=THRESHOLD |
|
# Parse arguments and process standard ones (e.g. usage, help, version) |
|
$p->getopts; |
|
|
|
+# perform sanity checking on command line options |
|
+my %warning; |
|
+if (defined $p->opts->warning) { |
|
+ my @warning = split(',', $p->opts->warning); |
|
+ $p->nagios_die("You should specify 1 to 3 ranges for --warning argument") unless $#warning < 3; |
|
+ |
|
+ $warning{'nb_running_node'} = shift @warning; |
|
+ $warning{'nb_running_disc_node'} = shift @warning; |
|
+ $warning{'nb_running_ram_node'} = shift @warning; |
|
+} |
|
+ |
|
+my %critical; # critical range for each metric, filled in the order the metrics are reported |
|
+if (defined $p->opts->critical) { |
|
+ my @critical = split(',', $p->opts->critical); # one range per metric, comma separated |
|
+ $p->nagios_die("You should specify 1 to 3 ranges for --critical argument") unless $#critical < 3; # $#critical is the last index: rejects 4 or more ranges |
|
+ |
|
+ $critical{'nb_running_node'} = shift @critical; # missing trailing ranges are left undef |
|
+ $critical{'nb_running_disc_node'} = shift @critical; |
|
+ $critical{'nb_running_ram_node'} = shift @critical; |
|
+} |
|
+ |
|
+# check stuff. |
|
my $hostname=$p->opts->hostname; |
|
my $port=$p->opts->port; |
|
|
|
my $url = sprintf("http%s://%s:%d/api/nodes", ($p->opts->ssl ? "s" : ""), $hostname, $port); |
|
my $ua = LWP::UserAgent->new; |
|
+ |
|
if (defined $p->opts->proxyurl) |
|
{ |
|
$ua->proxy('http', $p->opts->proxyurl); |
|
@@ -110,40 +139,85 @@ $ua->timeout($p->opts->timeout); |
|
if ($p->opts->ssl and $ua->can('ssl_opts')) { |
|
$ua->ssl_opts(verify_hostname => $p->opts->ssl_strict); |
|
} |
|
-my $req = HTTP::Request->new(GET => $url); |
|
-$req->authorization_basic($p->opts->username, $p->opts->password); |
|
-my $res = $ua->request($req); |
|
- |
|
-if (!$res->is_success) { |
|
- # Deal with standard error conditions - make the messages more sensible |
|
- if ($res->code == 400) { |
|
- my $bodyref = decode_json $res->content; |
|
- $p->nagios_exit(CRITICAL, $bodyref->{'reason'}); |
|
- } |
|
- $res->code == 404 and $p->nagios_die("Not found"); |
|
- $res->code == 401 and $p->nagios_die("Access refused"); |
|
- if ($res->code < 200 or $res->code > 400 ) { |
|
- $p->nagios_exit(CRITICAL, "Received ".$res->status_line); |
|
- } |
|
+ |
|
+my ($retcode, $result) = request($url); |
|
+if ($retcode != 200) { |
|
+ $p->nagios_exit(CRITICAL, "$result : $url"); |
|
} |
|
|
|
-my @nodes = @{ decode_json $res->content }; |
|
-my $count = 0; |
|
-foreach my $node ( @nodes ) { |
|
- if ($node->{"name"} && $node->{"running"}) { |
|
- $count++; |
|
- } |
|
+my $values = {}; |
|
+$values->{'running_nodes'} = (); |
|
+$values->{'nb_running_node'} = 0; |
|
+$values->{'nb_running_disc_node'} = 0; |
|
+$values->{'nb_running_ram_node'} = 0; |
|
+ |
|
+foreach my $node ( @$result ) { |
|
+ if ($node->{"name"} && $node->{"running"}) { |
|
+ push @{ $values->{'running_nodes'} }, $node->{"name"}; |
|
+ |
|
+ $values->{'nb_running_node'}++; |
|
+ $values->{'nb_running_disc_node'}++ if ($node->{"type"} && $node->{"type"} eq "disc"); |
|
+ $values->{'nb_running_ram_node'}++ if ($node->{"type"} && $node->{"type"} eq "ram"); |
|
+ } |
|
} |
|
|
|
-if ($p->opts->critical && $count <= $p->opts->critical) { |
|
- $p->nagios_exit( CRITICAL, "The cluster has $count nodes" ); |
|
+my $code = 0; |
|
+my $message = ""; |
|
+ |
|
+if (defined($p->opts->nodes)) { |
|
+ my @nodes = split(',', $p->opts->nodes); |
|
+ my @excluded_nodes = diff(\@nodes, \@{ $values->{'running_nodes'} }); |
|
+ my $nb_excluded_nodes = @excluded_nodes; |
|
+ ($code, $message) = (OK, "All nodes are running"); |
|
+ ($code, $message) = (CRITICAL, "$nb_excluded_nodes failed cluster node: " . join(',', @excluded_nodes)) if($nb_excluded_nodes ne 0); |
|
+} |
|
+else { |
|
+ my @metrics = ("nb_running_node", "nb_running_disc_node", "nb_running_ram_node"); |
|
+ for my $metric (@metrics) { |
|
+ my $warning = undef; |
|
+ $warning = $warning{$metric} if (defined $warning{$metric} and $warning{$metric} ne -1); |
|
+ |
|
+ my $critical = undef; |
|
+ $critical = $critical{$metric} if (defined $critical{$metric} and $critical{$metric} ne -1); |
|
+ |
|
+ my $value = 0; |
|
+ $value = $values->{$metric} if defined $values->{$metric}; |
|
+ my $code = $p->check_threshold(check => $value, warning => $warning, critical=> $critical); |
|
+ $p->add_message($code, sprintf("$metric ".$STATUS_TEXT{$code}." (%d)", $value)); |
|
+ } |
|
+ ($code, $message) = $p->check_messages(join_all=>', '); |
|
} |
|
|
|
-if ($p->opts->warning && $count <= $p->opts->warning) { |
|
- $p->nagios_exit( WARNING, "The cluster has $count nodes" ); |
|
+$p->nagios_exit(return_code => $code, message => $message); |
|
+ |
|
+sub request { |
|
+ my ($url) = @_; |
|
+ my $req = HTTP::Request->new(GET => $url); |
|
+ $req->authorization_basic($p->opts->username, $p->opts->password); |
|
+ my $res = $ua->request($req); |
|
+ |
|
+ if (!$res->is_success) { |
|
+ # Deal with standard error conditions - make the messages more sensible |
|
+ if ($res->code == 400) { |
|
+ my $bodyref = decode_json $res->content; |
|
+ return (400, $bodyref->{'reason'}); |
|
+ |
|
+ } |
|
+ $res->code == 404 and return (404, "Not Found"); |
|
+ $res->code == 401 and return (401, "Access Refused"); |
|
+ $res->status_line =~ /Can\'t connect/ and return (500, "Connection Refused : $url"); |
|
+ if ($res->code < 200 or $res->code > 400 ) { |
|
+ return ($res->code, "Received ".$res->status_line); |
|
+ } |
|
+ } |
|
+ my $bodyref = decode_json $res->content; |
|
+ return($res->code, $bodyref); |
|
} |
|
|
|
-$p->nagios_exit( OK, "The cluster has $count nodes" ); |
|
+sub diff { # returns the entries of @$array_1 that do not appear in @$array_2 |
|
+ my ($array_1, $array_2) = (@_); |
|
+ return grep { my $baz = $_; !grep($_ eq $baz, @$array_2) } @$array_1; # inner grep counts exact matches; keep the entry only when there are none |
|
+} |
|
|
|
=head1 NAME |
|
|
|
@@ -206,7 +280,7 @@ is needed is to specify the host to connect to: |
|
|
|
This returns a standard Nagios result: |
|
|
|
- RABBITMQ_NODE OK - The cluster has 3 nodes |
|
+ RABBITMQ_CLUSTER OK - nb_running_node OK (3), nb_running_disc_node OK (3), nb_running_ram_node OK (0) |
|
|
|
=head1 ERRORS |
|
|
|
diff --git a/t/1_checks_scripts.t b/t/1_checks_scripts.t |
|
index ad9a4b7..7672f84 100644 |
|
--- a/t/1_checks_scripts.t |
|
+++ b/t/1_checks_scripts.t |
|
@@ -13,9 +13,7 @@ $args = sprintf("--hostname=%s --username=%s --password=%s", $rabbit_hostname, $ |
|
script_runs(['scripts/check_rabbitmq_aliveness', $args]); |
|
|
|
# Checks on check_rabbitmq_cluster |
|
-$regex = /The cluster has \d+ nodes/im; |
|
script_runs(['scripts/check_rabbitmq_cluster', ($args, ' -w 1 -c 1')]); |
|
-script_stdout_like $regex, 'scripts/check_rabbitmq_cluster stdout is correct'; |
|
|
|
# Checks on check_rabbitmq_connections |
|
script_runs(['scripts/check_rabbitmq_connections', $args]); |
|
@@ -35,7 +33,7 @@ script_runs(['scripts/check_rabbitmq_queue', ($args, '--queue=aliveness-test')]) |
|
# Checks on check_rabbitmq_server |
|
$regex = /(Memory=.*)\s(Process=.*)\s(FD=.*)/im; |
|
script_runs(['scripts/check_rabbitmq_server', ($args, "--node=${rabbit_servername}")]); |
|
-script_stdout_like $regex, 'scripts/check_rabbitmq_server stdout is correct'; |
|
+script_stdout_like $regex, 'Scripts/check_rabbitmq_server stdout is correct'; |
|
|
|
# Checks on check_rabbitmq_watermark |
|
script_runs(['scripts/check_rabbitmq_watermark', ($args, "--node=${rabbit_servername}")]); |
|
-- |
|
2.20.1 |
|
|
|
|