commit d2123d68275acc0f061e73d5f86ca504e0d5a344
Author: Szabolcs Nagy <szabolcs.nagy@arm.com>
Date:   Tue Feb 16 12:55:13 2021 +0000

elf: Fix slow tls access after dlopen [BZ #19924]

In short: __tls_get_addr checks the global generation counter, and if
the current dtv is older, _dl_update_slotinfo updates the dtv up to the
generation of the accessed module. So if the global generation is newer
than the generation of the module, __tls_get_addr keeps hitting the
slow dtv update path. The dtv update path includes a number of checks
to see whether any update is needed, and this already causes a
measurable TLS access slowdown after dlopen.

It may be possible to detect an up-to-date dtv faster. But if many
modules are loaded (> TLS_SLOTINFO_SURPLUS), this requires at least
walking the slotinfo list.

This patch instead updates the dtv to the global generation, so after a
dlopen the TLS access slow path is only hit once. Modules with a larger
generation than the accessed one were not necessarily synchronized
before, so additional synchronization is needed.

This patch uses acquire/release synchronization when accessing the
generation counter.

Note: in the x86_64 version of dl-tls.c the generation is only loaded
once, since a relaxed mo load is not faster than an acquire mo load.

I have not benchmarked this. Tested by Adhemerval Zanella on aarch64,
powerpc, sparc and x86, who reported that it fixes the performance
issue of bug 19924.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

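Editor's note: the acquire/release pairing described in the message can be
shown with a minimal, self-contained sketch. This is an illustration only,
not glibc code; generation_counter, slotinfo_data, loader_bump_generation
and tls_access_fast_path are hypothetical stand-ins for
GL(dl_tls_generation), the slotinfo/dtv state, the dlopen/dlclose path and
__tls_get_addr.

#include <stdatomic.h>
#include <stddef.h>

static _Atomic size_t generation_counter;  /* stands in for GL(dl_tls_generation) */
static size_t slotinfo_data;               /* stands in for the slotinfo/dtv updates */

/* Loader side (dlopen/dlclose): publish the update, then release-store
   the new generation so a reader that acquire-loads it sees the update.  */
static void loader_bump_generation (void)
{
  slotinfo_data = 42;   /* update done under the loader's TLS lock */
  atomic_store_explicit (&generation_counter, 1, memory_order_release);
}

/* TLS access side (__tls_get_addr): the acquire load synchronizes with
   the release store, so once the new generation is observed the slotinfo
   update is visible and the dtv can be brought up to that generation in
   one pass.  */
static size_t tls_access_fast_path (void)
{
  size_t gen = atomic_load_explicit (&generation_counter, memory_order_acquire);
  if (gen != 0)
    return slotinfo_data;   /* guaranteed to read the published value */
  return 0;
}
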
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 985cd4e2821436af..95a03c9616c6a786 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -739,7 +739,7 @@ _dl_close_worker (struct link_map *map, bool force)
if (__glibc_unlikely (newgen == 0))
_dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n");
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);

if (tls_free_end == GL(dl_tls_static_used))
GL(dl_tls_static_used) = tls_free_start;
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 633b047ad2497296..c8a5d88161441031 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -404,7 +404,7 @@ update_tls_slotinfo (struct link_map *new)
_dl_fatal_printf (N_("\
TLS generation counter wrapped! Please report this."));
/* Can be read concurrently. */
- atomic_store_relaxed (&GL(dl_tls_generation), newgen);
+ atomic_store_release (&GL(dl_tls_generation), newgen);

/* We need a second pass for static tls data, because
_dl_update_slotinfo must not be run while calls to
@@ -421,8 +421,8 @@ TLS generation counter wrapped! Please report this."));
now, but we can delay updating the DTV. */
imap->l_need_tls_init = 0;
#ifdef SHARED
- /* Update the slot information data for at least the
- generation of the DSO we are allocating data for. */
+ /* Update the slot information data for the current
+ generation. */

/* FIXME: This can terminate the process on memory
allocation failure. It is not possible to raise
@@ -430,7 +430,7 @@ TLS generation counter wrapped! Please report this."));
_dl_update_slotinfo would have to be split into two
operations, similar to resize_scopes and update_scopes
above. This is related to bug 16134. */
- _dl_update_slotinfo (imap->l_tls_modid);
+ _dl_update_slotinfo (imap->l_tls_modid, newgen);
#endif

dl_init_static_tls (imap);
diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c
index 5b69321bda1f2b27..be3e09e36835ed23 100644
--- a/elf/dl-reloc.c
+++ b/elf/dl-reloc.c
@@ -111,11 +111,11 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional)
if (map->l_real->l_relocated)
{
#ifdef SHARED
+ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock)
+ is held here so normal load of the generation counter is valid. */
if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation),
0))
- /* Update the slot information data for at least the generation of
- the DSO we are allocating data for. */
- (void) _dl_update_slotinfo (map->l_tls_modid);
+ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation));
#endif

dl_init_static_tls (map);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index fab6546e2d31edd4..b8ada16f1637c910 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -721,57 +721,57 @@ allocate_and_init (struct link_map *map)


struct link_map *
-_dl_update_slotinfo (unsigned long int req_modid)
+_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen)
{
struct link_map *the_map = NULL;
dtv_t *dtv = THREAD_DTV ();

- /* The global dl_tls_dtv_slotinfo array contains for each module
- index the generation counter current when the entry was created.
+ /* CONCURRENCY NOTES:
+
+ The global dl_tls_dtv_slotinfo_list array contains for each module
+ index the generation counter current when that entry was updated.
This array never shrinks so that all module indices which were
- valid at some time can be used to access it. Before the first
- use of a new module index in this function the array was extended
- appropriately. Access also does not have to be guarded against
- modifications of the array. It is assumed that pointer-size
- values can be read atomically even in SMP environments. It is
- possible that other threads at the same time dynamically load
- code and therefore add to the slotinfo list. This is a problem
- since we must not pick up any information about incomplete work.
- The solution to this is to ignore all dtv slots which were
- created after the one we are currently interested. We know that
- dynamic loading for this module is completed and this is the last
- load operation we know finished. */
- unsigned long int idx = req_modid;
+ valid at some time can be used to access it. Concurrent loading
+ and unloading of modules can update slotinfo entries or extend
+ the array. The updates happen under the GL(dl_load_tls_lock) and
+ finish with the release store of the generation counter to
+ GL(dl_tls_generation) which is synchronized with the load of
+ new_gen in the caller. So updates up to new_gen are synchronized
+ but updates for later generations may not be.
+
+ Here we update the thread dtv from old_gen (== dtv[0].counter) to
+ new_gen generation. For this, each dtv[i] entry is either set to
+ an unallocated state (set), or left unmodified (nop). Where (set)
+ may resize the dtv first if modid i >= dtv[-1].counter. The rules
+ for the decision between (set) and (nop) are
+
+ (1) If slotinfo entry i is concurrently updated then either (set)
+ or (nop) is valid: TLS access cannot use dtv[i] unless it is
+ synchronized with a generation > new_gen.
+
+ Otherwise, if the generation of slotinfo entry i is gen and the
+ loaded module for this entry is map then
+
+ (2) If gen <= old_gen then do (nop).
+
+ (3) If old_gen < gen <= new_gen then
+ (3.1) if map != 0 then (set)
+ (3.2) if map == 0 then either (set) or (nop).
+
+ Note that (1) cannot be reliably detected, but since both actions
+ are valid it does not have to be. Only (2) and (3.1) cases need
+ to be distinguished for which relaxed mo access of gen and map is
+ enough: their value is synchronized when it matters.
+
+ Note that a relaxed mo load may give an out-of-thin-air value since
+ it is used in decisions that can affect concurrent stores. But this
+ should only happen if the OOTA value causes UB that justifies the
+ concurrent store of the value. This is not expected to be an issue
+ in practice. */
struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);

- while (idx >= listp->len)
+ if (dtv[0].counter < new_gen)
{
- idx -= listp->len;
- listp = listp->next;
- }
-
- if (dtv[0].counter < listp->slotinfo[idx].gen)
- {
- /* CONCURRENCY NOTES:
-
- Here the dtv needs to be updated to new_gen generation count.
-
- This code may be called during TLS access when GL(dl_load_tls_lock)
- is not held. In that case the user code has to synchronize with
- dlopen and dlclose calls of relevant modules. A module m is
- relevant if the generation of m <= new_gen and dlclose of m is
- synchronized: a memory access here happens after the dlopen and
- before the dlclose of relevant modules. The dtv entries for
- relevant modules need to be updated, other entries can be
- arbitrary.
-
- This e.g. means that the first part of the slotinfo list can be
- accessed race free, but the tail may be concurrently extended.
- Similarly relevant slotinfo entries can be read race free, but
- other entries are racy. However updating a non-relevant dtv
- entry does not affect correctness. For a relevant module m,
- max_modid >= modid of m. */
- size_t new_gen = listp->slotinfo[idx].gen;
size_t total = 0;
size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx));
assert (max_modid >= req_modid);
@@ -784,31 +784,33 @@ _dl_update_slotinfo (unsigned long int req_modid)
{
size_t modid = total + cnt;

- /* Later entries are not relevant. */
+ /* Case (1) for all later modids. */
if (modid > max_modid)
break;

size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen);

+ /* Case (1). */
if (gen > new_gen)
- /* Not relevant. */
continue;

- /* If the entry is older than the current dtv layout we
- know we don't have to handle it. */
+ /* Case (2) or (1). */
if (gen <= dtv[0].counter)
continue;

+ /* Case (3) or (1). */
+
/* If there is no map this means the entry is empty. */
struct link_map *map
= atomic_load_relaxed (&listp->slotinfo[cnt].map);
/* Check whether the current dtv array is large enough. */
if (dtv[-1].counter < modid)
{
+ /* Case (3.2) or (1). */
if (map == NULL)
continue;

- /* Resize the dtv. */
+ /* Resizing the dtv aborts on failure: bug 16134. */
dtv = _dl_resize_dtv (dtv, max_modid);

assert (modid <= dtv[-1].counter);
@@ -819,7 +821,7 @@ _dl_update_slotinfo (unsigned long int req_modid)
}

/* If there is currently memory allocate for this
- dtv entry free it. */
+ dtv entry free it. Note: this is not AS-safe. */
/* XXX Ideally we will at some point create a memory
pool. */
free (dtv[modid].pointer.to_free);
@@ -914,9 +916,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)

static struct link_map *
__attribute_noinline__
-update_get_addr (GET_ADDR_ARGS)
+update_get_addr (GET_ADDR_ARGS, size_t gen)
{
- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE);
+ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen);
dtv_t *dtv = THREAD_DTV ();

void *p = dtv[GET_ADDR_MODULE].pointer.val;
@@ -946,12 +948,17 @@ __tls_get_addr (GET_ADDR_ARGS)
dtv_t *dtv = THREAD_DTV ();

/* Update is needed if dtv[0].counter < the generation of the accessed
- module. The global generation counter is used here as it is easier
- to check. Synchronization for the relaxed MO access is guaranteed
- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. */
+ module, but the global generation counter is easier to check (which
+ must be synchronized up to the generation of the accessed module by
+ user code doing the TLS access so relaxed mo read is enough). */
size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ {
+ /* Update DTV up to the global generation, see CONCURRENCY NOTES
+ in _dl_update_slotinfo. */
+ gen = atomic_load_acquire (&GL(dl_tls_generation));
+ return update_get_addr (GET_ADDR_PARAM, gen);
+ }

void *p = dtv[GET_ADDR_MODULE].pointer.val;

diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 8c0fe98f69a88f1e..7964e133e4930e88 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1304,7 +1304,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)

/* Update slot information data for at least the generation of the
module with the given index. */
-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
+extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
+ size_t gen)
attribute_hidden;

/* Look up the module's TLS block as for __tls_get_addr,
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
index 24ef560b718275a2..4ded8dd6b94edc81 100644
--- a/sysdeps/x86_64/dl-tls.c
+++ b/sysdeps/x86_64/dl-tls.c
@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
{
dtv_t *dtv = THREAD_DTV ();

- size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+ size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
if (__glibc_unlikely (dtv[0].counter != gen))
- return update_get_addr (GET_ADDR_PARAM);
+ return update_get_addr (GET_ADDR_PARAM, gen);

return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
}
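Editor's note: for reference, a rough reproducer of the bug 19924 scenario
could look like the sketch below. It is hypothetical and not part of the
patch or of the glibc test suite: "libextra-tls.so" and touch_tls() are
assumed names. touch_tls() is assumed to live in a shared object that is
already loaded at program start (linked with the main program) and to
increment one of that object's own __thread variables, so the access goes
through __tls_get_addr; the dlopen of a different DSO with a TLS segment
then leaves the global generation ahead of that module's generation. Before
the patch the loop hits the dtv-update slow path on every iteration; after
the patch only on the first. Build with -ldl on glibc older than 2.34.

#include <dlfcn.h>
#include <stdio.h>

/* Assumed to be defined in a shared object loaded at startup and to
   touch one of its own __thread variables via __tls_get_addr.  */
extern void touch_tls (void);

int main (void)
{
  /* Loading any DSO with a TLS segment bumps GL(dl_tls_generation).  */
  void *handle = dlopen ("libextra-tls.so", RTLD_NOW);
  if (handle == NULL)
    fprintf (stderr, "dlopen: %s\n", dlerror ());

  /* Time this loop with and without the patch to see the difference.  */
  for (int i = 0; i < 1000000; i++)
    touch_tls ();

  if (handle != NULL)
    dlclose (handle);
  return 0;
}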