From c670ebe177f0b67da2804f7b3862b4c141a032fa Mon Sep 17 00:00:00 2001 From: "Thierno IB. BARRY" Date: Tue, 26 Apr 2016 20:21:24 +0200 Subject: [PATCH 03/10] Improve the cluster check script to make more nagios friendly (#69) - Use request function as other scripts - Use check_threshold instead of manual check (close #67) - Add an expected node list in the cluster (close #68) --- scripts/check_rabbitmq_cluster | 138 +++++++++++++++++++++++++-------- t/1_checks_scripts.t | 4 +- 2 files changed, 107 insertions(+), 35 deletions(-) diff --git a/scripts/check_rabbitmq_cluster b/scripts/check_rabbitmq_cluster index 534f01f..f4da8cb 100755 --- a/scripts/check_rabbitmq_cluster +++ b/scripts/check_rabbitmq_cluster @@ -7,7 +7,8 @@ use strict; use warnings; -use Monitoring::Plugin; +use Monitoring::Plugin qw(OK CRITICAL WARNING UNKNOWN); +use Monitoring::Plugin::Functions qw(%STATUS_TEXT); use LWP::UserAgent; use URI::Escape; use JSON; @@ -73,18 +74,23 @@ $p->add_arg(spec => 'proxyurl=s', help => "Use proxy url like http://proxy.domain.com:8080", ); +$p->add_arg(spec => 'nodes|n=s', + help => "Comma separated list of expected nodes in the cluster", +); + $p->add_arg( - spec => 'warning|w=i', + spec => 'warning|w=s', help => -qq{-w, --warning=THRESHOLD +qq{-w, --warning=THRESHOLD[,THRESHOLD[,THRESHOLD]] Warning thresholds specified in order that the metrics are returned. Specify -1 if no warning threshold.}, + ); $p->add_arg( - spec => 'critical|c=i', + spec => 'critical|c=s', help => -qq{-c, --critical=THRESHOLD +qq{-c, --critical=THRESHOLD[,THRESHOLD[,THRESHOLD]] Critical thresholds specified in order that the metrics are returned. Specify -1 if no critical threshold.}, ); @@ -92,11 +98,34 @@ qq{-c, --critical=THRESHOLD # Parse arguments and process standard ones (e.g. usage, help, version) $p->getopts; +# perform sanity checking on command line options +my %warning; +if (defined $p->opts->warning) { + my @warning = split(',', $p->opts->warning); + $p->nagios_die("You should specify 1 to 3 ranges for --warning argument") unless $#warning < 3; + + $warning{'nb_running_node'} = shift @warning; + $warning{'nb_running_disc_node'} = shift @warning; + $warning{'nb_running_ram_node'} = shift @warning; +} + +my %critical; +if (defined $p->opts->critical) { + my @critical = split(',', $p->opts->critical); + $p->nagios_die("You should specify specify 1 to 3 ranges for --critical argument") unless $#critical < 3; + + $critical{'nb_running_node'} = shift @critical; + $critical{'nb_running_disc_node'} = shift @critical; + $critical{'nb_running_ram_node'} = shift @critical; +} + +# check stuff. my $hostname=$p->opts->hostname; my $port=$p->opts->port; my $url = sprintf("http%s://%s:%d/api/nodes", ($p->opts->ssl ? "s" : ""), $hostname, $port); my $ua = LWP::UserAgent->new; + if (defined $p->opts->proxyurl) { $ua->proxy('http', $p->opts->proxyurl); @@ -110,40 +139,85 @@ $ua->timeout($p->opts->timeout); if ($p->opts->ssl and $ua->can('ssl_opts')) { $ua->ssl_opts(verify_hostname => $p->opts->ssl_strict); } -my $req = HTTP::Request->new(GET => $url); -$req->authorization_basic($p->opts->username, $p->opts->password); -my $res = $ua->request($req); - -if (!$res->is_success) { - # Deal with standard error conditions - make the messages more sensible - if ($res->code == 400) { - my $bodyref = decode_json $res->content; - $p->nagios_exit(CRITICAL, $bodyref->{'reason'}); - } - $res->code == 404 and $p->nagios_die("Not found"); - $res->code == 401 and $p->nagios_die("Access refused"); - if ($res->code < 200 or $res->code > 400 ) { - $p->nagios_exit(CRITICAL, "Received ".$res->status_line); - } + +my ($retcode, $result) = request($url); +if ($retcode != 200) { + $p->nagios_exit(CRITICAL, "$result : $url"); } -my @nodes = @{ decode_json $res->content }; -my $count = 0; -foreach my $node ( @nodes ) { - if ($node->{"name"} && $node->{"running"}) { - $count++; - } +my $values = {}; +$values->{'running_nodes'} = (); +$values->{'nb_running_node'} = 0; +$values->{'nb_running_disc_node'} = 0; +$values->{'nb_running_ram_node'} = 0; + +foreach my $node ( @$result ) { + if ($node->{"name"} && $node->{"running"}) { + push @{ $values->{'running_nodes'} }, $node->{"name"}; + + $values->{'nb_running_node'}++; + $values->{'nb_running_disc_node'}++ if ($node->{"type"} && $node->{"type"} eq "disc"); + $values->{'nb_running_ram_node'}++ if ($node->{"type"} && $node->{"type"} eq "ram"); + } } -if ($p->opts->critical && $count <= $p->opts->critical) { - $p->nagios_exit( CRITICAL, "The cluster has $count nodes" ); +my $code = 0; +my $message = ""; + +if (defined($p->opts->nodes)) { + my @nodes = split(',', $p->opts->nodes); + my @excluded_nodes = diff(\@nodes, \@{ $values->{'running_nodes'} }); + my $nb_excluded_nodes = @excluded_nodes; + ($code, $message) = (OK, "All nodes are running"); + ($code, $message) = (CRITICAL, "$nb_excluded_nodes failed cluster node: " . join(',', @excluded_nodes)) if($nb_excluded_nodes ne 0); +} +else { + my @metrics = ("nb_running_node", "nb_running_disc_node", "nb_running_ram_node"); + for my $metric (@metrics) { + my $warning = undef; + $warning = $warning{$metric} if (defined $warning{$metric} and $warning{$metric} ne -1); + + my $critical = undef; + $critical = $critical{$metric} if (defined $critical{$metric} and $critical{$metric} ne -1); + + my $value = 0; + $value = $values->{$metric} if defined $values->{$metric}; + my $code = $p->check_threshold(check => $value, warning => $warning, critical=> $critical); + $p->add_message($code, sprintf("$metric ".$STATUS_TEXT{$code}." (%d)", $value)); + } + ($code, $message) = $p->check_messages(join_all=>', '); } -if ($p->opts->warning && $count <= $p->opts->warning) { - $p->nagios_exit( WARNING, "The cluster has $count nodes" ); +$p->nagios_exit(return_code => $code, message => $message); + +sub request { + my ($url) = @_; + my $req = HTTP::Request->new(GET => $url); + $req->authorization_basic($p->opts->username, $p->opts->password); + my $res = $ua->request($req); + + if (!$res->is_success) { + # Deal with standard error conditions - make the messages more sensible + if ($res->code == 400) { + my $bodyref = decode_json $res->content; + return (400, $bodyref->{'reason'}); + + } + $res->code == 404 and return (404, "Not Found"); + $res->code == 401 and return (401, "Access Refused"); + $res->status_line =~ /Can\'t connect/ and return (500, "Connection Refused : $url"); + if ($res->code < 200 or $res->code > 400 ) { + return ($res->code, "Received ".$res->status_line); + } + } + my $bodyref = decode_json $res->content; + return($res->code, $bodyref); } -$p->nagios_exit( OK, "The cluster has $count nodes" ); +sub diff { + my ($array_1, $array_2) = (@_); + return grep { my $baz = $_; grep($_ ne $baz, @$array_2) } @$array_1; +} =head1 NAME @@ -206,7 +280,7 @@ is needed is to specify the host to connect to: This returns a standard Nagios result: - RABBITMQ_NODE OK - The cluster has 3 nodes + RABBITMQ_CLUSTER OK - The cluster has 3 nodes =head1 ERRORS diff --git a/t/1_checks_scripts.t b/t/1_checks_scripts.t index ad9a4b7..7672f84 100644 --- a/t/1_checks_scripts.t +++ b/t/1_checks_scripts.t @@ -13,9 +13,7 @@ $args = sprintf("--hostname=%s --username=%s --password=%s", $rabbit_hostname, $ script_runs(['scripts/check_rabbitmq_aliveness', $args]); # Checks on check_rabbitmq_cluster -$regex = /The cluster has \d+ nodes/im; script_runs(['scripts/check_rabbitmq_cluster', ($args, ' -w 1 -c 1')]); -script_stdout_like $regex, 'scripts/check_rabbitmq_cluster stdout is correct'; # Checks on check_rabbitmq_connections script_runs(['scripts/check_rabbitmq_connections', $args]); @@ -35,7 +33,7 @@ script_runs(['scripts/check_rabbitmq_queue', ($args, '--queue=aliveness-test')]) # Checks on check_rabbitmq_server $regex = /(Memory=.*)\s(Process=.*)\s(FD=.*)/im; script_runs(['scripts/check_rabbitmq_server', ($args, "--node=${rabbit_servername}")]); -script_stdout_like $regex, 'scripts/check_rabbitmq_server stdout is correct'; +script_stdout_like $regex, 'Scripts/check_rabbitmq_server stdout is correct'; # Checks on check_rabbitmq_watermark script_runs(['scripts/check_rabbitmq_watermark', ($args, "--node=${rabbit_servername}")]); -- 2.20.1