You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
186 lines
5.7 KiB
186 lines
5.7 KiB
7 years ago
|
From 738577dd30b782104057496bf01f09e28216892b Mon Sep 17 00:00:00 2001
|
||
|
From: Dejan Muhamedagic <dejan@hello-penguin.com>
|
||
|
Date: Mon, 26 Jun 2017 15:56:01 +0200
|
||
|
Subject: [PATCH 1/2] Medium: ocf-shellfuncs: improve locking (ocf_take_lock)
|
||
|
|
||
|
This change improves locking by ocf_take_lock(). It uses mkdir(1)
|
||
|
to prevent two instances from creating the same directory (named
|
||
|
by the lock).
|
||
|
|
||
|
The major difficulty is to prevent a race when a stale lock is
|
||
|
discovered. If two processes try to remove the stale lock at
|
||
|
about the same time, the one which runs slightly later can remove
|
||
|
the lock which just got created by the one which run slightly
|
||
|
earlier. The probability of this race is significantly reduced by
|
||
|
testing for stale lock twice with a random sleep in between.
|
||
|
|
||
|
Though this change does not exclude a race entirely, it makes it
|
||
|
extremely improbable. In addition, stale locks are result of only
|
||
|
abnormal circumstances and occur seldom.
|
||
|
|
||
|
The function providing random numbers has been modified to use
|
||
|
either /dev/urandom or awk (with the process pid as the seed).
|
||
|
|
||
|
It was thoroughly tested with both stale lock simulation and
|
||
|
without, by running 64 instances of processes trying to get the
|
||
|
lock on a workstation with 4 cpus.
|
||
|
---
|
||
|
heartbeat/ocf-shellfuncs.in | 74 ++++++++++++++++++++++++++++++++++-----------
|
||
|
1 file changed, 57 insertions(+), 17 deletions(-)
|
||
|
|
||
|
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
|
||
|
index ebc221d5f..615f5b4b8 100644
|
||
|
--- a/heartbeat/ocf-shellfuncs.in
|
||
|
+++ b/heartbeat/ocf-shellfuncs.in
|
||
|
@@ -72,10 +72,11 @@ ocf_is_root() {
|
||
|
}
|
||
|
|
||
|
ocf_maybe_random() {
|
||
|
- local rnd="$RANDOM"
|
||
|
- # Something sane-ish in case a shell doesn't support $RANDOM
|
||
|
- [ -n "$rnd" ] || rnd=$$
|
||
|
- echo $rnd
|
||
|
+ if test -c /dev/urandom; then
|
||
|
+ od -An -N4 -tu4 /dev/urandom | tr -d '[:space:]'
|
||
|
+ else
|
||
|
+ awk -v pid=$$ 'BEGIN{srand(pid); print rand()}' | sed 's/^.*[.]//'
|
||
|
+ fi
|
||
|
}
|
||
|
|
||
|
# Portability comments:
|
||
|
@@ -465,24 +466,63 @@ ocf_pidfile_status() {
|
||
|
return 1
|
||
|
}
|
||
|
|
||
|
-ocf_take_lock() {
|
||
|
- local lockfile=$1
|
||
|
- local rnd=$(ocf_maybe_random)
|
||
|
+# mkdir(1) based locking
|
||
|
+# first the directory is created with the name given as $1
|
||
|
+# then a file named "pid" is created within that directory with
|
||
|
+# the process PID
|
||
|
|
||
|
- sleep 0.$rnd
|
||
|
- while
|
||
|
- ocf_pidfile_status $lockfile
|
||
|
- do
|
||
|
- ocf_log info "Sleeping until $lockfile is released..."
|
||
|
- sleep 0.$rnd
|
||
|
- done
|
||
|
- echo $$ > $lockfile
|
||
|
+ocf_get_stale_pid() {
|
||
|
+ local piddir=$1
|
||
|
+ local pid
|
||
|
+ [ -z "$piddir" ] && return 2
|
||
|
+ pid=`cat $piddir/pid 2>/dev/null`
|
||
|
+ [ -z "$pid" ] && return 1 # no process
|
||
|
+ kill -0 $pid >/dev/null 2>&1 && return 1 # not stale
|
||
|
+ echo $pid
|
||
|
}
|
||
|
|
||
|
+# There is a race when the following two functions to manage the
|
||
|
+# lock file (mk and rm) are invoked in parallel by different
|
||
|
+# instances. It is up to the caller to reduce probability of that
|
||
|
+# taking place (see ocf_take_lock() below).
|
||
|
+
|
||
|
+ocf_mk_pid() {
|
||
|
+ mkdir $1 2>/dev/null && echo $$ > $1/pid
|
||
|
+}
|
||
|
+ocf_rm_pid() {
|
||
|
+ rm -f $1/pid
|
||
|
+ rmdir $1 2>/dev/null
|
||
|
+}
|
||
|
+
|
||
|
+# Testing and subsequently removing a stale lock (containing the
|
||
|
+# process pid) is inherently difficult to do in such a way as to
|
||
|
+# prevent a race between creating a pid file and removing it and
|
||
|
+# its directory. We reduce the probability of that happening by
|
||
|
+# checking if the stale lock persists over a random period of
|
||
|
+# time.
|
||
|
+
|
||
|
+ocf_take_lock() {
|
||
|
+ local lockdir=$1
|
||
|
+ local rnd
|
||
|
+ local stale_pid
|
||
|
+
|
||
|
+ # we don't want it too short, so strip leading zeros
|
||
|
+ rnd=$(ocf_maybe_random | sed 's/^0*//')
|
||
|
+ stale_pid=`ocf_get_stale_pid $lockdir`
|
||
|
+ if [ -n "$stale_pid" ]; then
|
||
|
+ sleep 0.$rnd
|
||
|
+ # remove "stale pid" only if it persists
|
||
|
+ [ "$stale_pid" = "`ocf_get_stale_pid $lockdir`" ] &&
|
||
|
+ ocf_rm_pid $lockdir
|
||
|
+ fi
|
||
|
+ while ! ocf_mk_pid $lockdir; do
|
||
|
+ ocf_log info "Sleeping until $lockdir is released..."
|
||
|
+ sleep 0.$rnd
|
||
|
+ done
|
||
|
+}
|
||
|
|
||
|
ocf_release_lock_on_exit() {
|
||
|
- local lockfile=$1
|
||
|
- trap "rm -f $lockfile" EXIT
|
||
|
+ trap "ocf_rm_pid $1" EXIT
|
||
|
}
|
||
|
|
||
|
# returns true if the CRM is currently running a probe. A probe is
|
||
|
|
||
|
From 46e6f1d0e736e68c7a48c94083d7037e590365b4 Mon Sep 17 00:00:00 2001
|
||
|
From: Dejan Muhamedagic <dejan@hello-penguin.com>
|
||
|
Date: Mon, 26 Jun 2017 20:29:06 +0200
|
||
|
Subject: [PATCH 2/2] Dev: ocf-shellfuncs: handle empty lock directories
|
||
|
|
||
|
---
|
||
|
heartbeat/ocf-shellfuncs.in | 34 ++++++++++++++++++++++++++++------
|
||
|
1 file changed, 28 insertions(+), 6 deletions(-)
|
||
|
|
||
|
diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
|
||
|
index 615f5b4b8..817b2a557 100644
|
||
|
--- a/heartbeat/ocf-shellfuncs.in
|
||
|
+++ b/heartbeat/ocf-shellfuncs.in
|
||
|
@@ -470,15 +470,37 @@ ocf_pidfile_status() {
|
||
|
# first the directory is created with the name given as $1
|
||
|
# then a file named "pid" is created within that directory with
|
||
|
# the process PID
|
||
|
-
|
||
|
+# stale locks are handled carefully, the inode of a directory
|
||
|
+# needs to match before and after test if the process is running
|
||
|
+# empty directories are also handled appropriately
|
||
|
+# we relax (sleep) occasionally to allow for other processes to
|
||
|
+# finish managing the lock in case they are in the middle of the
|
||
|
+# business
|
||
|
+
|
||
|
+relax() { sleep 0.5; }
|
||
|
ocf_get_stale_pid() {
|
||
|
- local piddir=$1
|
||
|
- local pid
|
||
|
+ local piddir pid dir_inode
|
||
|
+
|
||
|
+ piddir="$1"
|
||
|
[ -z "$piddir" ] && return 2
|
||
|
+ dir_inode="`ls -di $piddir 2>/dev/null`"
|
||
|
+ [ -z "$dir_inode" ] && return 1
|
||
|
pid=`cat $piddir/pid 2>/dev/null`
|
||
|
- [ -z "$pid" ] && return 1 # no process
|
||
|
- kill -0 $pid >/dev/null 2>&1 && return 1 # not stale
|
||
|
- echo $pid
|
||
|
+ if [ -z "$pid" ]; then
|
||
|
+ # empty directory?
|
||
|
+ relax
|
||
|
+ if [ "$dir_inode" = "`ls -di $piddir 2>/dev/null`" ]; then
|
||
|
+ echo $dir_inode
|
||
|
+ else
|
||
|
+ return 1
|
||
|
+ fi
|
||
|
+ elif kill -0 $pid >/dev/null 2>&1; then
|
||
|
+ return 1
|
||
|
+ elif relax && [ -e "$piddir/pid" ] && [ "$dir_inode" = "`ls -di $piddir 2>/dev/null`" ]; then
|
||
|
+ echo $pid
|
||
|
+ else
|
||
|
+ return 1
|
||
|
+ fi
|
||
|
}
|
||
|
|
||
|
# There is a race when the following two functions to manage the
|