You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
224 lines
9.5 KiB
224 lines
9.5 KiB
7 months ago
|
From af992e7abdc9049714da76cae1e5e18bc4838fb8 Mon Sep 17 00:00:00 2001
|
||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||
|
Date: Wed, 7 Jun 2023 13:18:01 -0500
|
||
|
Subject: [PATCH] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3
|
||
|
/ 4`
|
||
|
Content-type: text/plain; charset=UTF-8
|
||
|
|
||
|
Current `non_temporal_threshold` set to roughly '3/4 * sizeof_L3 /
|
||
|
ncores_per_socket'. This patch updates that value to roughly
|
||
|
'sizeof_L3 / 4'.
|
||
|
|
||
|
The original value (specifically dividing the `ncores_per_socket`) was
|
||
|
done to limit the amount of other threads' data a `memcpy`/`memset`
|
||
|
could evict.
|
||
|
|
||
|
Dividing by 'ncores_per_socket', however, leads to exceedingly low
|
||
|
non-temporal thresholds and leads to using non-temporal stores in
|
||
|
cases where REP MOVSB is multiple times faster.
|
||
|
|
||
|
Furthermore, non-temporal stores are written directly to main memory
|
||
|
so using them at a size much smaller than L3 can place soon to be
|
||
|
accessed data much further away than it otherwise could be. As well,
|
||
|
modern machines are able to detect streaming patterns (especially if
|
||
|
REP MOVSB is used) and provide LRU hints to the memory subsystem. This
|
||
|
in effect caps the total amount of eviction at 1/cache_associativity,
|
||
|
far below meaningfully thrashing the entire cache.
|
||
|
|
||
|
As best I can tell, the benchmarks that led to this small threshold
|
||
|
were done comparing non-temporal stores versus standard cacheable
|
||
|
stores. A better comparison (linked below) is to REP MOVSB which,
|
||
|
on the measured systems, is nearly 2x faster than non-temporal stores
|
||
|
at the low-end of the previous threshold, and within 10% for over
|
||
|
100MB copies (well past even the current threshold). In cases with a
|
||
|
low number of threads competing for bandwidth, REP MOVSB is ~2x faster
|
||
|
up to `sizeof_L3`.
|
||
|
|
||
|
The divisor of `4` is a somewhat arbitrary value. From benchmarks it
|
||
|
seems Skylake and Icelake both prefer a divisor of `2`, but older CPUs
|
||
|
such as Broadwell prefer something closer to `8`. This patch is meant
|
||
|
to be followed up by another one to make the divisor cpu-specific, but
|
||
|
in the meantime (and for easier backporting), this patch settles on
|
||
|
`4` as a middle-ground.
|
||
|
|
||
|
Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable
|
||
|
stores were done using:
|
||
|
https://github.com/goldsteinn/memcpy-nt-benchmarks
|
||
|
|
||
|
Sheets results (also available in pdf on the github):
|
||
|
https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml
|
||
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||
|
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||
|
---
|
||
|
sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++---------------
|
||
|
1 file changed, 43 insertions(+), 27 deletions(-)
|
||
|
|
||
|
|
||
|
[diff rebased by DJ]
|
||
|
diff -rup a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
||
|
--- a/sysdeps/x86/dl-cacheinfo.h 2023-07-25 00:38:39.386831871 -0400
|
||
|
+++ b/sysdeps/x86/dl-cacheinfo.h 2023-07-25 00:38:40.372870369 -0400
|
||
|
@@ -408,7 +408,7 @@ handle_zhaoxin (int name)
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
|
||
|
+get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
|
||
|
long int core)
|
||
|
{
|
||
|
unsigned int eax;
|
||
|
@@ -427,6 +427,7 @@ get_common_cache_info (long int *shared_
|
||
|
unsigned int family = cpu_features->basic.family;
|
||
|
unsigned int model = cpu_features->basic.model;
|
||
|
long int shared = *shared_ptr;
|
||
|
+ long int shared_per_thread = *shared_per_thread_ptr;
|
||
|
unsigned int threads = *threads_ptr;
|
||
|
bool inclusive_cache = true;
|
||
|
bool support_count_mask = true;
|
||
|
@@ -442,6 +443,7 @@ get_common_cache_info (long int *shared_
|
||
|
/* Try L2 otherwise. */
|
||
|
level = 2;
|
||
|
shared = core;
|
||
|
+ shared_per_thread = core;
|
||
|
threads_l2 = 0;
|
||
|
threads_l3 = -1;
|
||
|
}
|
||
|
@@ -598,29 +600,28 @@ get_common_cache_info (long int *shared_
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
-intel_bug_no_cache_info:
|
||
|
- /* Assume that all logical threads share the highest cache
|
||
|
- level. */
|
||
|
- threads
|
||
|
- = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
|
||
|
- & 0xff);
|
||
|
- }
|
||
|
-
|
||
|
- /* Cap usage of highest cache level to the number of supported
|
||
|
- threads. */
|
||
|
- if (shared > 0 && threads > 0)
|
||
|
- shared /= threads;
|
||
|
+ intel_bug_no_cache_info:
|
||
|
+ /* Assume that all logical threads share the highest cache
|
||
|
+ level. */
|
||
|
+ threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
|
||
|
+ & 0xff);
|
||
|
+
|
||
|
+ /* Get per-thread size of highest level cache. */
|
||
|
+ if (shared_per_thread > 0 && threads > 0)
|
||
|
+ shared_per_thread /= threads;
|
||
|
+ }
|
||
|
}
|
||
|
|
||
|
/* Account for non-inclusive L2 and L3 caches. */
|
||
|
if (!inclusive_cache)
|
||
|
{
|
||
|
if (threads_l2 > 0)
|
||
|
- core /= threads_l2;
|
||
|
+ shared_per_thread += core / threads_l2;
|
||
|
shared += core;
|
||
|
}
|
||
|
|
||
|
*shared_ptr = shared;
|
||
|
+ *shared_per_thread_ptr = shared_per_thread;
|
||
|
*threads_ptr = threads;
|
||
|
}
|
||
|
|
||
|
@@ -630,6 +631,7 @@ dl_init_cacheinfo (struct cpu_features *
|
||
|
/* Find out what brand of processor. */
|
||
|
long int data = -1;
|
||
|
long int shared = -1;
|
||
|
+ long int shared_per_thread = -1;
|
||
|
long int core = -1;
|
||
|
unsigned int threads = 0;
|
||
|
unsigned long int level1_icache_size = -1;
|
||
|
@@ -650,6 +652,7 @@ dl_init_cacheinfo (struct cpu_features *
|
||
|
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
|
||
|
core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
|
||
|
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
|
||
|
+ shared_per_thread = shared;
|
||
|
|
||
|
level1_icache_size
|
||
|
= handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
|
||
|
@@ -673,13 +676,14 @@ dl_init_cacheinfo (struct cpu_features *
|
||
|
level4_cache_size
|
||
|
= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
|
||
|
|
||
|
- get_common_cache_info (&shared, &threads, core);
|
||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||
|
}
|
||
|
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
|
||
|
{
|
||
|
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
|
||
|
core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
|
||
|
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
|
||
|
+ shared_per_thread = shared;
|
||
|
|
||
|
level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
|
||
|
level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
|
||
|
@@ -693,13 +697,14 @@ dl_init_cacheinfo (struct cpu_features *
|
||
|
level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
|
||
|
level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
|
||
|
|
||
|
- get_common_cache_info (&shared, &threads, core);
|
||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||
|
}
|
||
|
else if (cpu_features->basic.kind == arch_kind_amd)
|
||
|
{
|
||
|
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
|
||
|
core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features);
|
||
|
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features);
|
||
|
+ shared_per_thread = shared;
|
||
|
|
||
|
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
|
||
|
level1_icache_linesize
|
||
|
@@ -721,6 +726,9 @@ dl_init_cacheinfo (struct cpu_features *
|
||
|
if (shared <= 0)
|
||
|
/* No shared L3 cache. All we have is the L2 cache. */
|
||
|
shared = core;
|
||
|
+
|
||
|
+ if (shared_per_thread <= 0)
|
||
|
+ shared_per_thread = shared;
|
||
|
}
|
||
|
|
||
|
cpu_features->level1_icache_size = level1_icache_size;
|
||
|
@@ -736,17 +744,25 @@ dl_init_cacheinfo (struct cpu_features *
|
||
|
cpu_features->level3_cache_linesize = level3_cache_linesize;
|
||
|
cpu_features->level4_cache_size = level4_cache_size;
|
||
|
|
||
|
- /* The default setting for the non_temporal threshold is 3/4 of one
|
||
|
- thread's share of the chip's cache. For most Intel and AMD processors
|
||
|
- with an initial release date between 2017 and 2020, a thread's typical
|
||
|
- share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
|
||
|
- threshold leaves 125 KBytes to 500 KBytes of the thread's data
|
||
|
- in cache after a maximum temporal copy, which will maintain
|
||
|
- in cache a reasonable portion of the thread's stack and other
|
||
|
- active data. If the threshold is set higher than one thread's
|
||
|
- share of the cache, it has a substantial risk of negatively
|
||
|
- impacting the performance of other threads running on the chip. */
|
||
|
- unsigned long int non_temporal_threshold = shared * 3 / 4;
|
||
|
+ /* The default setting for the non_temporal threshold is 1/4 of size
|
||
|
+ of the chip's cache. For most Intel and AMD processors with an
|
||
|
+ initial release date between 2017 and 2023, a thread's typical
|
||
|
+ share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
|
||
|
+ estimate the point where non-temporal stores begin out-competing
|
||
|
+ REP MOVSB. As well the point where the fact that non-temporal
|
||
|
+ stores are forced back to main memory would have already occurred to the
|
||
|
+ majority of the lines in the copy. Note, concerns about the
|
||
|
+ entire L3 cache being evicted by the copy are mostly alleviated
|
||
|
+ by the fact that modern HW detects streaming patterns and
|
||
|
+ provides proper LRU hints so that the maximum thrashing
|
||
|
+ is capped at 1/associativity. */
|
||
|
+ unsigned long int non_temporal_threshold = shared / 4;
|
||
|
+ /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
|
||
|
+ a higher risk of actually thrashing the cache as they don't have a HW LRU
|
||
|
+ hint. As well, their performance in highly parallel situations is
|
||
|
+ noticeably worse. */
|
||
|
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||
|
+ non_temporal_threshold = shared_per_thread * 3 / 4;
|
||
|
/* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
|
||
|
'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
|
||
|
if that operation cannot overflow. Minimum of 0x4040 (16448) because the
|
||
|
|