You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
185 lines
5.7 KiB
185 lines
5.7 KiB
From 738577dd30b782104057496bf01f09e28216892b Mon Sep 17 00:00:00 2001 |
|
From: Dejan Muhamedagic <dejan@hello-penguin.com> |
|
Date: Mon, 26 Jun 2017 15:56:01 +0200 |
|
Subject: [PATCH 1/2] Medium: ocf-shellfuncs: improve locking (ocf_take_lock) |
|
|
|
This change improves locking by ocf_take_lock(). It uses mkdir(1) |
|
to prevent two instances from creating the same directory (named |
|
by the lock). |
|
|
|
The major difficulty is to prevent a race when a stale lock is |
|
discovered. If two processes try to remove the stale lock at |
|
about the same time, the one which runs slightly later can remove |
|
the lock which just got created by the one which run slightly |
|
earlier. The probability of this race is significantly reduced by |
|
testing for stale lock twice with a random sleep in between. |
|
|
|
Though this change does not exclude a race entirely, it makes it |
|
extremely improbable. In addition, stale locks are result of only |
|
abnormal circumstances and occur seldom. |
|
|
|
The function providing random numbers has been modified to use |
|
either /dev/urandom or awk (with the process pid as the seed). |
|
|
|
It was thoroughly tested with both stale lock simulation and |
|
without, by running 64 instances of processes trying to get the |
|
lock on a workstation with 4 cpus. |
|
--- |
|
heartbeat/ocf-shellfuncs.in | 74 ++++++++++++++++++++++++++++++++++----------- |
|
1 file changed, 57 insertions(+), 17 deletions(-) |
|
|
|
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in |
|
index ebc221d5f..615f5b4b8 100644 |
|
--- a/heartbeat/ocf-shellfuncs.in |
|
+++ b/heartbeat/ocf-shellfuncs.in |
|
@@ -72,10 +72,11 @@ ocf_is_root() { |
|
} |
|
|
|
ocf_maybe_random() { |
|
- local rnd="$RANDOM" |
|
- # Something sane-ish in case a shell doesn't support $RANDOM |
|
- [ -n "$rnd" ] || rnd=$$ |
|
- echo $rnd |
|
+ if test -c /dev/urandom; then |
|
+ od -An -N4 -tu4 /dev/urandom | tr -d '[:space:]' |
|
+ else |
|
+ awk -v pid=$$ 'BEGIN{srand(pid); print rand()}' | sed 's/^.*[.]//' |
|
+ fi |
|
} |
|
|
|
# Portability comments: |
|
@@ -465,24 +466,63 @@ ocf_pidfile_status() { |
|
return 1 |
|
} |
|
|
|
-ocf_take_lock() { |
|
- local lockfile=$1 |
|
- local rnd=$(ocf_maybe_random) |
|
+# mkdir(1) based locking |
|
+# first the directory is created with the name given as $1 |
|
+# then a file named "pid" is created within that directory with |
|
+# the process PID |
|
|
|
- sleep 0.$rnd |
|
- while |
|
- ocf_pidfile_status $lockfile |
|
- do |
|
- ocf_log info "Sleeping until $lockfile is released..." |
|
- sleep 0.$rnd |
|
- done |
|
- echo $$ > $lockfile |
|
+ocf_get_stale_pid() { |
|
+ local piddir=$1 |
|
+ local pid |
|
+ [ -z "$piddir" ] && return 2 |
|
+ pid=`cat $piddir/pid 2>/dev/null` |
|
+ [ -z "$pid" ] && return 1 # no process |
|
+ kill -0 $pid >/dev/null 2>&1 && return 1 # not stale |
|
+ echo $pid |
|
} |
|
|
|
+# There is a race when the following two functions to manage the |
|
+# lock file (mk and rm) are invoked in parallel by different |
|
+# instances. It is up to the caller to reduce probability of that |
|
+# taking place (see ocf_take_lock() below). |
|
+ |
|
+ocf_mk_pid() { |
|
+ mkdir $1 2>/dev/null && echo $$ > $1/pid |
|
+} |
|
+ocf_rm_pid() { |
|
+ rm -f $1/pid |
|
+ rmdir $1 2>/dev/null |
|
+} |
|
+ |
|
+# Testing and subsequently removing a stale lock (containing the |
|
+# process pid) is inherently difficult to do in such a way as to |
|
+# prevent a race between creating a pid file and removing it and |
|
+# its directory. We reduce the probability of that happening by |
|
+# checking if the stale lock persists over a random period of |
|
+# time. |
|
+ |
|
+ocf_take_lock() { |
|
+ local lockdir=$1 |
|
+ local rnd |
|
+ local stale_pid |
|
+ |
|
+ # we don't want it too short, so strip leading zeros |
|
+ rnd=$(ocf_maybe_random | sed 's/^0*//') |
|
+ stale_pid=`ocf_get_stale_pid $lockdir` |
|
+ if [ -n "$stale_pid" ]; then |
|
+ sleep 0.$rnd |
|
+ # remove "stale pid" only if it persists |
|
+ [ "$stale_pid" = "`ocf_get_stale_pid $lockdir`" ] && |
|
+ ocf_rm_pid $lockdir |
|
+ fi |
|
+ while ! ocf_mk_pid $lockdir; do |
|
+ ocf_log info "Sleeping until $lockdir is released..." |
|
+ sleep 0.$rnd |
|
+ done |
|
+} |
|
|
|
ocf_release_lock_on_exit() { |
|
- local lockfile=$1 |
|
- trap "rm -f $lockfile" EXIT |
|
+ trap "ocf_rm_pid $1" EXIT |
|
} |
|
|
|
# returns true if the CRM is currently running a probe. A probe is |
|
|
|
From 46e6f1d0e736e68c7a48c94083d7037e590365b4 Mon Sep 17 00:00:00 2001 |
|
From: Dejan Muhamedagic <dejan@hello-penguin.com> |
|
Date: Mon, 26 Jun 2017 20:29:06 +0200 |
|
Subject: [PATCH 2/2] Dev: ocf-shellfuncs: handle empty lock directories |
|
|
|
--- |
|
heartbeat/ocf-shellfuncs.in | 34 ++++++++++++++++++++++++++++------ |
|
1 file changed, 28 insertions(+), 6 deletions(-) |
|
|
|
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in |
|
index 615f5b4b8..817b2a557 100644 |
|
--- a/heartbeat/ocf-shellfuncs.in |
|
+++ b/heartbeat/ocf-shellfuncs.in |
|
@@ -470,15 +470,37 @@ ocf_pidfile_status() { |
|
# first the directory is created with the name given as $1 |
|
# then a file named "pid" is created within that directory with |
|
# the process PID |
|
- |
|
+# stale locks are handled carefully, the inode of a directory |
|
+# needs to match before and after test if the process is running |
|
+# empty directories are also handled appropriately |
|
+# we relax (sleep) occasionally to allow for other processes to |
|
+# finish managing the lock in case they are in the middle of the |
|
+# business |
|
+ |
|
+relax() { sleep 0.5; } |
|
ocf_get_stale_pid() { |
|
- local piddir=$1 |
|
- local pid |
|
+ local piddir pid dir_inode |
|
+ |
|
+ piddir="$1" |
|
[ -z "$piddir" ] && return 2 |
|
+ dir_inode="`ls -di $piddir 2>/dev/null`" |
|
+ [ -z "$dir_inode" ] && return 1 |
|
pid=`cat $piddir/pid 2>/dev/null` |
|
- [ -z "$pid" ] && return 1 # no process |
|
- kill -0 $pid >/dev/null 2>&1 && return 1 # not stale |
|
- echo $pid |
|
+ if [ -z "$pid" ]; then |
|
+ # empty directory? |
|
+ relax |
|
+ if [ "$dir_inode" = "`ls -di $piddir 2>/dev/null`" ]; then |
|
+ echo $dir_inode |
|
+ else |
|
+ return 1 |
|
+ fi |
|
+ elif kill -0 $pid >/dev/null 2>&1; then |
|
+ return 1 |
|
+ elif relax && [ -e "$piddir/pid" ] && [ "$dir_inode" = "`ls -di $piddir 2>/dev/null`" ]; then |
|
+ echo $pid |
|
+ else |
|
+ return 1 |
|
+ fi |
|
} |
|
|
|
# There is a race when the following two functions to manage the
|
|
|