commit 0ae1006967eef11909fbed0f6ecef2f260b133d3 Author: Noah Goldstein Date: Wed Mar 23 16:57:22 2022 -0500 x86: Optimize strcspn and strpbrk in strcspn-c.c Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of _mm_cmpistri. Also change offset to unsigned to avoid unnecessary sign extensions. geometric_mean(N=20) of all benchmarks that dont fallback on sse2/strlen; New / Original: .928 All string/memory tests pass. Reviewed-by: H.J. Lu (cherry picked from commit 30d627d477d7255345a4b713cf352ac32d644d61) diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index c56ddbd22f014653..2436b6dcd90d8efe 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a) RETURN (NULL, strlen (s)); const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (unsigned int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); if (offset != 0) { /* Load masks. */ aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); + maskz = _mm_cmpeq_epi8 (mask0, zero); /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return STRCSPN_SSE2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return STRCSPN_SSE2 (s, a); - } + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); } - offset = (int) ((size_t) s & 15); + aligned = s; + offset = (unsigned int) ((size_t) s & 15); if (offset != 0) { + start_unaligned: /* Check partial string. */ aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); value = __m128i_shift_right (value, offset); - int length = _mm_cmpistri (mask, value, 0x2); + unsigned int length = _mm_cmpistri (mask, value, 0x2); /* No need to check ZFlag since ZFlag is always 1. */ - int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); if (cflag) RETURN ((char *) (s + length), length); /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); + unsigned int index = _mm_cmpistri (value, value, 0x3a); if (index < 16 - offset) RETURN (NULL, index); aligned += 16; } - else - aligned = s; +start_loop: while (1) { __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x2); - int cflag = _mm_cmpistrc (mask, value, 0x2); - int zflag = _mm_cmpistrz (mask, value, 0x2); + unsigned int index = _mm_cmpistri (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); if (cflag) RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); if (zflag)