You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
356 lines
13 KiB
356 lines
13 KiB
commit 82a707aeb74f23bb1783af0f9e93790a2038ff7e |
|
Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> |
|
Date: Mon Jun 6 12:17:43 2022 -0700 |
|
|
|
x86_64: Add strstr function with 512-bit EVEX |
|
|
|
Adding a 512-bit EVEX version of strstr. The algorithm works as follows: |
|
|
|
(1) We spend a few cycles at the begining to peek into the needle. We |
|
locate an edge in the needle (first occurance of 2 consequent distinct |
|
characters) and also store the first 64-bytes into a zmm register. |
|
|
|
(2) We search for the edge in the haystack by looking into one cache |
|
line of the haystack at a time. This avoids having to read past a page |
|
boundary which can cause a seg fault. |
|
|
|
(3) If an edge is found in the haystack we first compare the first |
|
64-bytes of the needle (already stored in a zmm register) before we |
|
proceed with a full string compare performed byte by byte. |
|
|
|
Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512) |
|
|
|
Geometric mean of all benchmarks: new / old = 0.66 |
|
|
|
Difficult skiptable(0) : new / old = 0.02 |
|
Difficult skiptable(1) : new / old = 0.01 |
|
Difficult 2-way : new / old = 0.25 |
|
Difficult testing first 2 : new / old = 1.26 |
|
Difficult skiptable(0) : new / old = 0.05 |
|
Difficult skiptable(1) : new / old = 0.06 |
|
Difficult 2-way : new / old = 0.26 |
|
Difficult testing first 2 : new / old = 1.05 |
|
Difficult skiptable(0) : new / old = 0.42 |
|
Difficult skiptable(1) : new / old = 0.24 |
|
Difficult 2-way : new / old = 0.21 |
|
Difficult testing first 2 : new / old = 1.04 |
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
|
|
|
(cherry picked from commit 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c) |
|
|
|
x86: Remove __mmask intrinsics in strstr-avx512.c |
|
|
|
The intrinsics are not available before GCC7 and using standard |
|
operators generates code of equivalent or better quality. |
|
|
|
Removed: |
|
_cvtmask64_u64 |
|
_kshiftri_mask64 |
|
_kand_mask64 |
|
|
|
Geometric Mean of 5 Runs of Full Benchmark Suite New / Old: 0.958 |
|
|
|
(cherry picked from commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0) |
|
|
|
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile |
|
index 4d4ad2a3686b5bc3..0e39e63ef6be6a86 100644 |
|
--- a/sysdeps/x86_64/multiarch/Makefile |
|
+++ b/sysdeps/x86_64/multiarch/Makefile |
|
@@ -126,6 +126,7 @@ sysdep_routines += \ |
|
strrchr-sse2 \ |
|
strspn-c \ |
|
strspn-sse2 \ |
|
+ strstr-avx512 \ |
|
strstr-sse2-unaligned \ |
|
varshift \ |
|
# sysdep_routines |
|
@@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4 |
|
CFLAGS-strcspn-c.c += -msse4 |
|
CFLAGS-strpbrk-c.c += -msse4 |
|
CFLAGS-strspn-c.c += -msse4 |
|
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 |
|
endif |
|
|
|
ifeq ($(subdir),wcsmbs) |
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c |
|
index 6b75a7106e174bce..043821278fdb6d8f 100644 |
|
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c |
|
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c |
|
@@ -633,6 +633,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
|
|
|
/* Support sysdeps/x86_64/multiarch/strstr.c. */ |
|
IFUNC_IMPL (i, name, strstr, |
|
+ IFUNC_IMPL_ADD (array, i, strstr, |
|
+ (CPU_FEATURE_USABLE (AVX512VL) |
|
+ && CPU_FEATURE_USABLE (AVX512BW) |
|
+ && CPU_FEATURE_USABLE (AVX512DQ) |
|
+ && CPU_FEATURE_USABLE (BMI2)), |
|
+ __strstr_avx512) |
|
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) |
|
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) |
|
|
|
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c |
|
new file mode 100644 |
|
index 0000000000000000..e44c1a05dc0007e5 |
|
--- /dev/null |
|
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c |
|
@@ -0,0 +1,218 @@ |
|
+/* strstr optimized with 512-bit AVX-512 instructions |
|
+ Copyright (C) 2022 Free Software Foundation, Inc. |
|
+ This file is part of the GNU C Library. |
|
+ |
|
+ The GNU C Library is free software; you can redistribute it and/or |
|
+ modify it under the terms of the GNU Lesser General Public |
|
+ License as published by the Free Software Foundation; either |
|
+ version 2.1 of the License, or (at your option) any later version. |
|
+ |
|
+ The GNU C Library is distributed in the hope that it will be useful, |
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
+ Lesser General Public License for more details. |
|
+ |
|
+ You should have received a copy of the GNU Lesser General Public |
|
+ License along with the GNU C Library; if not, see |
|
+ <https://www.gnu.org/licenses/>. */ |
|
+ |
|
+#include <immintrin.h> |
|
+#include <inttypes.h> |
|
+#include <stdbool.h> |
|
+#include <string.h> |
|
+ |
|
+#define FULL_MMASK64 0xffffffffffffffff |
|
+#define ONE_64BIT 0x1ull |
|
+#define ZMM_SIZE_IN_BYTES 64 |
|
+#define PAGESIZE 4096 |
|
+ |
|
+#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__) |
|
+#define kshiftri_mask64(x, y) ((x) >> (y)) |
|
+#define kand_mask64(x, y) ((x) & (y)) |
|
+ |
|
+/* |
|
+ Returns the index of the first edge within the needle, returns 0 if no edge |
|
+ is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' |
|
+ */ |
|
+static inline size_t |
|
+find_edge_in_needle (const char *ned) |
|
+{ |
|
+ size_t ind = 0; |
|
+ while (ned[ind + 1] != '\0') |
|
+ { |
|
+ if (ned[ind] != ned[ind + 1]) |
|
+ return ind; |
|
+ else |
|
+ ind = ind + 1; |
|
+ } |
|
+ return 0; |
|
+} |
|
+ |
|
+/* |
|
+ Compare needle with haystack byte by byte at specified location |
|
+ */ |
|
+static inline bool |
|
+verify_string_match (const char *hay, const size_t hay_index, const char *ned, |
|
+ size_t ind) |
|
+{ |
|
+ while (ned[ind] != '\0') |
|
+ { |
|
+ if (ned[ind] != hay[hay_index + ind]) |
|
+ return false; |
|
+ ind = ind + 1; |
|
+ } |
|
+ return true; |
|
+} |
|
+ |
|
+/* |
|
+ Compare needle with haystack at specified location. The first 64 bytes are |
|
+ compared using a ZMM register. |
|
+ */ |
|
+static inline bool |
|
+verify_string_match_avx512 (const char *hay, const size_t hay_index, |
|
+ const char *ned, const __mmask64 ned_mask, |
|
+ const __m512i ned_zmm) |
|
+{ |
|
+ /* check first 64 bytes using zmm and then scalar */ |
|
+ __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so |
|
+ __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm); |
|
+ if (match != 0x0) // failed the first few chars |
|
+ return false; |
|
+ else if (ned_mask == FULL_MMASK64) |
|
+ return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES); |
|
+ return true; |
|
+} |
|
+ |
|
+char * |
|
+__strstr_avx512 (const char *haystack, const char *ned) |
|
+{ |
|
+ char first = ned[0]; |
|
+ if (first == '\0') |
|
+ return (char *)haystack; |
|
+ if (ned[1] == '\0') |
|
+ return (char *)strchr (haystack, ned[0]); |
|
+ |
|
+ size_t edge = find_edge_in_needle (ned); |
|
+ |
|
+ /* ensure haystack is as long as the pos of edge in needle */ |
|
+ for (int ii = 0; ii < edge; ++ii) |
|
+ { |
|
+ if (haystack[ii] == '\0') |
|
+ return NULL; |
|
+ } |
|
+ |
|
+ /* |
|
+ Load 64 bytes of the needle and save it to a zmm register |
|
+ Read one cache line at a time to avoid loading across a page boundary |
|
+ */ |
|
+ __mmask64 ned_load_mask = _bzhi_u64 ( |
|
+ FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); |
|
+ __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); |
|
+ __mmask64 ned_nullmask |
|
+ = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm); |
|
+ |
|
+ if (__glibc_unlikely (ned_nullmask == 0x0)) |
|
+ { |
|
+ ned_zmm = _mm512_loadu_si512 (ned); |
|
+ ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); |
|
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); |
|
+ if (ned_nullmask != 0x0) |
|
+ ned_load_mask = ned_load_mask >> 1; |
|
+ } |
|
+ else |
|
+ { |
|
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); |
|
+ ned_load_mask = ned_load_mask >> 1; |
|
+ } |
|
+ const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); |
|
+ const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]); |
|
+ |
|
+ /* |
|
+ Read the bytes of haystack in the current cache line |
|
+ */ |
|
+ size_t hay_index = edge; |
|
+ __mmask64 loadmask = _bzhi_u64 ( |
|
+ FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); |
|
+ /* First load is a partial cache line */ |
|
+ __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); |
|
+ /* Search for NULL and compare only till null char */ |
|
+ uint64_t nullmask |
|
+ = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0)); |
|
+ uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); |
|
+ cmpmask = cmpmask & cvtmask64_u64 (loadmask); |
|
+ /* Search for the 2 charaters of needle */ |
|
+ __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); |
|
+ __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); |
|
+ k1 = kshiftri_mask64 (k1, 1); |
|
+ /* k2 masks tell us if both chars from needle match */ |
|
+ uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask; |
|
+ /* For every match, search for the entire needle for a full match */ |
|
+ while (k2) |
|
+ { |
|
+ uint64_t bitcount = _tzcnt_u64 (k2); |
|
+ k2 = _blsr_u64 (k2); |
|
+ size_t match_pos = hay_index + bitcount - edge; |
|
+ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) |
|
+ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) |
|
+ { |
|
+ /* |
|
+ * Use vector compare as long as you are not crossing a page |
|
+ */ |
|
+ if (verify_string_match_avx512 (haystack, match_pos, ned, |
|
+ ned_load_mask, ned_zmm)) |
|
+ return (char *)haystack + match_pos; |
|
+ } |
|
+ else |
|
+ { |
|
+ if (verify_string_match (haystack, match_pos, ned, 0)) |
|
+ return (char *)haystack + match_pos; |
|
+ } |
|
+ } |
|
+ /* We haven't checked for potential match at the last char yet */ |
|
+ haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63)); |
|
+ hay_index = 0; |
|
+ |
|
+ /* |
|
+ Loop over one cache line at a time to prevent reading over page |
|
+ boundary |
|
+ */ |
|
+ __m512i hay1; |
|
+ while (nullmask == 0) |
|
+ { |
|
+ hay0 = _mm512_loadu_si512 (haystack + hay_index); |
|
+ hay1 = _mm512_load_si512 (haystack + hay_index |
|
+ + 1); // Always 64 byte aligned |
|
+ nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1)); |
|
+ /* Compare only till null char */ |
|
+ cmpmask = nullmask ^ (nullmask - ONE_64BIT); |
|
+ k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); |
|
+ k1 = _mm512_cmpeq_epi8_mask (hay1, ned1); |
|
+ /* k2 masks tell us if both chars from needle match */ |
|
+ k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask; |
|
+ /* For every match, compare full strings for potential match */ |
|
+ while (k2) |
|
+ { |
|
+ uint64_t bitcount = _tzcnt_u64 (k2); |
|
+ k2 = _blsr_u64 (k2); |
|
+ size_t match_pos = hay_index + bitcount - edge; |
|
+ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) |
|
+ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) |
|
+ { |
|
+ /* |
|
+ * Use vector compare as long as you are not crossing a page |
|
+ */ |
|
+ if (verify_string_match_avx512 (haystack, match_pos, ned, |
|
+ ned_load_mask, ned_zmm)) |
|
+ return (char *)haystack + match_pos; |
|
+ } |
|
+ else |
|
+ { |
|
+ /* Compare byte by byte */ |
|
+ if (verify_string_match (haystack, match_pos, ned, 0)) |
|
+ return (char *)haystack + match_pos; |
|
+ } |
|
+ } |
|
+ hay_index += ZMM_SIZE_IN_BYTES; |
|
+ } |
|
+ return NULL; |
|
+} |
|
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c |
|
index 848601bde7583ca3..9474d6234e9b62d3 100644 |
|
--- a/sysdeps/x86_64/multiarch/strstr.c |
|
+++ b/sysdeps/x86_64/multiarch/strstr.c |
|
@@ -35,16 +35,32 @@ |
|
|
|
extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; |
|
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; |
|
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; |
|
|
|
#include "init-arch.h" |
|
|
|
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle |
|
ifunc symbol properly. */ |
|
extern __typeof (__redirect_strstr) __libc_strstr; |
|
-libc_ifunc (__libc_strstr, |
|
- HAS_ARCH_FEATURE (Fast_Unaligned_Load) |
|
- ? __strstr_sse2_unaligned |
|
- : __strstr_sse2) |
|
|
|
+static inline void * |
|
+IFUNC_SELECTOR (void) |
|
+{ |
|
+ const struct cpu_features *cpu_features = __get_cpu_features (); |
|
+ |
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) |
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) |
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) |
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) |
|
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) |
|
+ return __strstr_avx512; |
|
+ |
|
+ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) |
|
+ return __strstr_sse2_unaligned; |
|
+ |
|
+ return __strstr_sse2; |
|
+} |
|
+ |
|
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); |
|
#undef strstr |
|
strong_alias (__libc_strstr, strstr)
|
|
|