You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
143 lines
4.5 KiB
143 lines
4.5 KiB
commit 0a2da0111037b1cc214f8f40ca5bdebf36f35cbd |
|
Author: Noah Goldstein <goldstein.w.n@gmail.com> |
|
Date: Wed Mar 23 16:57:24 2022 -0500 |
|
|
|
x86: Optimize strspn in strspn-c.c |
|
|
|
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of |
|
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary |
|
sign extensions. |
|
|
|
geometric_mean(N=20) of all benchmarks that don't fall back on |
|
sse2; New / Original: .901 |
|
|
|
All string/memory tests pass. |
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
|
|
|
(cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046) |
|
|
|
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c |
|
index a17196296b9ebe52..3bcc479f1b52ff6a 100644 |
|
--- a/sysdeps/x86_64/multiarch/strspn-c.c |
|
+++ b/sysdeps/x86_64/multiarch/strspn-c.c |
|
@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) |
|
return 0; |
|
|
|
const char *aligned; |
|
- __m128i mask; |
|
- int offset = (int) ((size_t) a & 15); |
|
+ __m128i mask, maskz, zero; |
|
+ unsigned int maskz_bits; |
|
+ unsigned int offset = (int) ((size_t) a & 15); |
|
+ zero = _mm_set1_epi8 (0); |
|
if (offset != 0) |
|
{ |
|
/* Load masks. */ |
|
aligned = (const char *) ((size_t) a & -16L); |
|
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned); |
|
- |
|
- mask = __m128i_shift_right (mask0, offset); |
|
+ maskz = _mm_cmpeq_epi8 (mask0, zero); |
|
|
|
/* Find where the NULL terminator is. */ |
|
- int length = _mm_cmpistri (mask, mask, 0x3a); |
|
- if (length == 16 - offset) |
|
- { |
|
- /* There is no NULL terminator. */ |
|
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); |
|
- int index = _mm_cmpistri (mask1, mask1, 0x3a); |
|
- length += index; |
|
- |
|
- /* Don't use SSE4.2 if the length of A > 16. */ |
|
- if (length > 16) |
|
- return __strspn_sse2 (s, a); |
|
- |
|
- if (index != 0) |
|
- { |
|
- /* Combine mask0 and mask1. We could play games with |
|
- palignr, but frankly this data should be in L1 now |
|
- so do the merge via an unaligned load. */ |
|
- mask = _mm_loadu_si128 ((__m128i *) a); |
|
- } |
|
- } |
|
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; |
|
+ if (maskz_bits != 0) |
|
+ { |
|
+ mask = __m128i_shift_right (mask0, offset); |
|
+ offset = (unsigned int) ((size_t) s & 15); |
|
+ if (offset) |
|
+ goto start_unaligned; |
|
+ |
|
+ aligned = s; |
|
+ goto start_loop; |
|
+ } |
|
} |
|
- else |
|
- { |
|
- /* A is aligned. */ |
|
- mask = _mm_load_si128 ((__m128i *) a); |
|
|
|
- /* Find where the NULL terminator is. */ |
|
- int length = _mm_cmpistri (mask, mask, 0x3a); |
|
- if (length == 16) |
|
- { |
|
- /* There is no NULL terminator. Don't use SSE4.2 if the length |
|
- of A > 16. */ |
|
- if (a[16] != 0) |
|
- return __strspn_sse2 (s, a); |
|
- } |
|
+ /* A is aligned. */ |
|
+ mask = _mm_loadu_si128 ((__m128i *) a); |
|
+ |
|
+ /* Find where the NULL terminator is. */ |
|
+ maskz = _mm_cmpeq_epi8 (mask, zero); |
|
+ maskz_bits = _mm_movemask_epi8 (maskz); |
|
+ if (maskz_bits == 0) |
|
+ { |
|
+ /* There is no NULL terminator. Don't use SSE4.2 if the length |
|
+ of A > 16. */ |
|
+ if (a[16] != 0) |
|
+ return __strspn_sse2 (s, a); |
|
} |
|
+ aligned = s; |
|
+ offset = (unsigned int) ((size_t) s & 15); |
|
|
|
- offset = (int) ((size_t) s & 15); |
|
if (offset != 0) |
|
{ |
|
+ start_unaligned: |
|
/* Check partial string. */ |
|
aligned = (const char *) ((size_t) s & -16L); |
|
__m128i value = _mm_load_si128 ((__m128i *) aligned); |
|
+ __m128i adj_value = __m128i_shift_right (value, offset); |
|
|
|
- value = __m128i_shift_right (value, offset); |
|
- |
|
- int length = _mm_cmpistri (mask, value, 0x12); |
|
+ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); |
|
/* No need to check CFlag since it is always 1. */ |
|
if (length < 16 - offset) |
|
return length; |
|
/* Find where the NULL terminator is. */ |
|
- int index = _mm_cmpistri (value, value, 0x3a); |
|
- if (index < 16 - offset) |
|
+ maskz = _mm_cmpeq_epi8 (value, zero); |
|
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; |
|
+ if (maskz_bits != 0) |
|
return length; |
|
aligned += 16; |
|
} |
|
- else |
|
- aligned = s; |
|
|
|
+start_loop: |
|
while (1) |
|
{ |
|
__m128i value = _mm_load_si128 ((__m128i *) aligned); |
|
- int index = _mm_cmpistri (mask, value, 0x12); |
|
- int cflag = _mm_cmpistrc (mask, value, 0x12); |
|
+ unsigned int index = _mm_cmpistri (mask, value, 0x12); |
|
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); |
|
if (cflag) |
|
return (size_t) (aligned + index - s); |
|
aligned += 16;
|
|
|