You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
865 lines
17 KiB
865 lines
17 KiB
commit 0a11305416e287d85c64f04337cfd64b6b350e0c |
|
Author: Noah Goldstein <goldstein.w.n@gmail.com> |
|
Date: Thu Apr 21 20:52:28 2022 -0500 |
|
|
|
x86: Optimize {str|wcs}rchr-sse2 |
|
|
|
The new code unrolls the main loop slightly without adding too much |
|
overhead and minimizes the comparisons for the search CHAR. |
|
|
|
Geometric Mean of all benchmarks New / Old: 0.741 |
|
See email for all results. |
|
|
|
Full xcheck passes on x86_64 with and without multiarch enabled. |
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
|
|
|
(cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c) |
|
|
|
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S |
|
index 67c30d0260cef8a3..a56300bc1830dedd 100644 |
|
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S |
|
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S |
|
@@ -17,7 +17,7 @@ |
|
<https://www.gnu.org/licenses/>. */ |
|
|
|
#if IS_IN (libc) |
|
-# define strrchr __strrchr_sse2 |
|
+# define STRRCHR __strrchr_sse2 |
|
|
|
# undef weak_alias |
|
# define weak_alias(strrchr, rindex) |
|
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S |
|
index a36034b40afe8d3d..00f69f2be77a43a0 100644 |
|
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S |
|
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S |
|
@@ -17,7 +17,6 @@ |
|
<https://www.gnu.org/licenses/>. */ |
|
|
|
#if IS_IN (libc) |
|
-# define wcsrchr __wcsrchr_sse2 |
|
+# define STRRCHR __wcsrchr_sse2 |
|
#endif |
|
- |
|
#include "../wcsrchr.S" |
|
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S |
|
index dfd09fe9508cb5bc..fc1598bb11417fd5 100644 |
|
--- a/sysdeps/x86_64/strrchr.S |
|
+++ b/sysdeps/x86_64/strrchr.S |
|
@@ -19,210 +19,360 @@ |
|
|
|
#include <sysdep.h> |
|
|
|
+#ifndef STRRCHR |
|
+# define STRRCHR strrchr |
|
+#endif |
|
+ |
|
+#ifdef USE_AS_WCSRCHR |
|
+# define PCMPEQ pcmpeqd |
|
+# define CHAR_SIZE 4 |
|
+# define PMINU pminud |
|
+#else |
|
+# define PCMPEQ pcmpeqb |
|
+# define CHAR_SIZE 1 |
|
+# define PMINU pminub |
|
+#endif |
|
+ |
|
+#define PAGE_SIZE 4096 |
|
+#define VEC_SIZE 16 |
|
+ |
|
.text |
|
-ENTRY (strrchr) |
|
- movd %esi, %xmm1 |
|
+ENTRY(STRRCHR) |
|
+ movd %esi, %xmm0 |
|
movq %rdi, %rax |
|
- andl $4095, %eax |
|
- punpcklbw %xmm1, %xmm1 |
|
- cmpq $4032, %rax |
|
- punpcklwd %xmm1, %xmm1 |
|
- pshufd $0, %xmm1, %xmm1 |
|
+ andl $(PAGE_SIZE - 1), %eax |
|
+#ifndef USE_AS_WCSRCHR |
|
+ punpcklbw %xmm0, %xmm0 |
|
+ punpcklwd %xmm0, %xmm0 |
|
+#endif |
|
+ pshufd $0, %xmm0, %xmm0 |
|
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
|
ja L(cross_page) |
|
- movdqu (%rdi), %xmm0 |
|
+ |
|
+L(cross_page_continue): |
|
+ movups (%rdi), %xmm1 |
|
pxor %xmm2, %xmm2 |
|
- movdqa %xmm0, %xmm3 |
|
- pcmpeqb %xmm1, %xmm0 |
|
- pcmpeqb %xmm2, %xmm3 |
|
- pmovmskb %xmm0, %ecx |
|
- pmovmskb %xmm3, %edx |
|
- testq %rdx, %rdx |
|
- je L(next_48_bytes) |
|
- leaq -1(%rdx), %rax |
|
- xorq %rdx, %rax |
|
- andq %rcx, %rax |
|
- je L(exit) |
|
- bsrq %rax, %rax |
|
+ PCMPEQ %xmm1, %xmm2 |
|
+ pmovmskb %xmm2, %ecx |
|
+ testl %ecx, %ecx |
|
+ jz L(aligned_more) |
|
+ |
|
+ PCMPEQ %xmm0, %xmm1 |
|
+ pmovmskb %xmm1, %eax |
|
+ leal -1(%rcx), %edx |
|
+ xorl %edx, %ecx |
|
+ andl %ecx, %eax |
|
+ jz L(ret0) |
|
+ bsrl %eax, %eax |
|
addq %rdi, %rax |
|
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If |
|
+ search CHAR is zero we are correct. Either way `andq |
|
+ -CHAR_SIZE, %rax` gets the correct result. */ |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
+L(ret0): |
|
ret |
|
|
|
+ /* Returns for first vec x1/x2 have hard coded backward search |
|
+ path for earlier matches. */ |
|
.p2align 4 |
|
-L(next_48_bytes): |
|
- movdqu 16(%rdi), %xmm4 |
|
- movdqa %xmm4, %xmm5 |
|
- movdqu 32(%rdi), %xmm3 |
|
- pcmpeqb %xmm1, %xmm4 |
|
- pcmpeqb %xmm2, %xmm5 |
|
- movdqu 48(%rdi), %xmm0 |
|
- pmovmskb %xmm5, %edx |
|
- movdqa %xmm3, %xmm5 |
|
- pcmpeqb %xmm1, %xmm3 |
|
- pcmpeqb %xmm2, %xmm5 |
|
- pcmpeqb %xmm0, %xmm2 |
|
- salq $16, %rdx |
|
- pmovmskb %xmm3, %r8d |
|
- pmovmskb %xmm5, %eax |
|
- pmovmskb %xmm2, %esi |
|
- salq $32, %r8 |
|
- salq $32, %rax |
|
- pcmpeqb %xmm1, %xmm0 |
|
- orq %rdx, %rax |
|
- movq %rsi, %rdx |
|
- pmovmskb %xmm4, %esi |
|
- salq $48, %rdx |
|
- salq $16, %rsi |
|
- orq %r8, %rsi |
|
- orq %rcx, %rsi |
|
- pmovmskb %xmm0, %ecx |
|
- salq $48, %rcx |
|
- orq %rcx, %rsi |
|
- orq %rdx, %rax |
|
- je L(loop_header2) |
|
- leaq -1(%rax), %rcx |
|
- xorq %rax, %rcx |
|
- andq %rcx, %rsi |
|
- je L(exit) |
|
- bsrq %rsi, %rsi |
|
- leaq (%rdi,%rsi), %rax |
|
+L(first_vec_x0_test): |
|
+ PCMPEQ %xmm0, %xmm1 |
|
+ pmovmskb %xmm1, %eax |
|
+ testl %eax, %eax |
|
+ jz L(ret0) |
|
+ bsrl %eax, %eax |
|
+ addq %r8, %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
ret |
|
|
|
.p2align 4 |
|
-L(loop_header2): |
|
- testq %rsi, %rsi |
|
- movq %rdi, %rcx |
|
- je L(no_c_found) |
|
-L(loop_header): |
|
- addq $64, %rdi |
|
- pxor %xmm7, %xmm7 |
|
- andq $-64, %rdi |
|
- jmp L(loop_entry) |
|
+L(first_vec_x1): |
|
+ PCMPEQ %xmm0, %xmm2 |
|
+ pmovmskb %xmm2, %eax |
|
+ leal -1(%rcx), %edx |
|
+ xorl %edx, %ecx |
|
+ andl %ecx, %eax |
|
+ jz L(first_vec_x0_test) |
|
+ bsrl %eax, %eax |
|
+ leaq (VEC_SIZE)(%rdi, %rax), %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
+ ret |
|
|
|
.p2align 4 |
|
-L(loop64): |
|
- testq %rdx, %rdx |
|
- cmovne %rdx, %rsi |
|
- cmovne %rdi, %rcx |
|
- addq $64, %rdi |
|
-L(loop_entry): |
|
- movdqa 32(%rdi), %xmm3 |
|
- pxor %xmm6, %xmm6 |
|
- movdqa 48(%rdi), %xmm2 |
|
- movdqa %xmm3, %xmm0 |
|
- movdqa 16(%rdi), %xmm4 |
|
- pminub %xmm2, %xmm0 |
|
- movdqa (%rdi), %xmm5 |
|
- pminub %xmm4, %xmm0 |
|
- pminub %xmm5, %xmm0 |
|
- pcmpeqb %xmm7, %xmm0 |
|
- pmovmskb %xmm0, %eax |
|
- movdqa %xmm5, %xmm0 |
|
- pcmpeqb %xmm1, %xmm0 |
|
- pmovmskb %xmm0, %r9d |
|
- movdqa %xmm4, %xmm0 |
|
- pcmpeqb %xmm1, %xmm0 |
|
- pmovmskb %xmm0, %edx |
|
- movdqa %xmm3, %xmm0 |
|
- pcmpeqb %xmm1, %xmm0 |
|
- salq $16, %rdx |
|
- pmovmskb %xmm0, %r10d |
|
- movdqa %xmm2, %xmm0 |
|
- pcmpeqb %xmm1, %xmm0 |
|
- salq $32, %r10 |
|
- orq %r10, %rdx |
|
- pmovmskb %xmm0, %r8d |
|
- orq %r9, %rdx |
|
- salq $48, %r8 |
|
- orq %r8, %rdx |
|
+L(first_vec_x1_test): |
|
+ PCMPEQ %xmm0, %xmm2 |
|
+ pmovmskb %xmm2, %eax |
|
testl %eax, %eax |
|
- je L(loop64) |
|
- pcmpeqb %xmm6, %xmm4 |
|
- pcmpeqb %xmm6, %xmm3 |
|
- pcmpeqb %xmm6, %xmm5 |
|
- pmovmskb %xmm4, %eax |
|
- pmovmskb %xmm3, %r10d |
|
- pcmpeqb %xmm6, %xmm2 |
|
- pmovmskb %xmm5, %r9d |
|
- salq $32, %r10 |
|
- salq $16, %rax |
|
- pmovmskb %xmm2, %r8d |
|
- orq %r10, %rax |
|
- orq %r9, %rax |
|
- salq $48, %r8 |
|
- orq %r8, %rax |
|
- leaq -1(%rax), %r8 |
|
- xorq %rax, %r8 |
|
- andq %r8, %rdx |
|
- cmovne %rdi, %rcx |
|
- cmovne %rdx, %rsi |
|
- bsrq %rsi, %rsi |
|
- leaq (%rcx,%rsi), %rax |
|
+ jz L(first_vec_x0_test) |
|
+ bsrl %eax, %eax |
|
+ leaq (VEC_SIZE)(%rdi, %rax), %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
+ ret |
|
+ |
|
+ .p2align 4 |
|
+L(first_vec_x2): |
|
+ PCMPEQ %xmm0, %xmm3 |
|
+ pmovmskb %xmm3, %eax |
|
+ leal -1(%rcx), %edx |
|
+ xorl %edx, %ecx |
|
+ andl %ecx, %eax |
|
+ jz L(first_vec_x1_test) |
|
+ bsrl %eax, %eax |
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
+ ret |
|
+ |
|
+ .p2align 4 |
|
+L(aligned_more): |
|
+ /* Save original pointer if match was in VEC 0. */ |
|
+ movq %rdi, %r8 |
|
+ andq $-VEC_SIZE, %rdi |
|
+ |
|
+ movaps VEC_SIZE(%rdi), %xmm2 |
|
+ pxor %xmm3, %xmm3 |
|
+ PCMPEQ %xmm2, %xmm3 |
|
+ pmovmskb %xmm3, %ecx |
|
+ testl %ecx, %ecx |
|
+ jnz L(first_vec_x1) |
|
+ |
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3 |
|
+ pxor %xmm4, %xmm4 |
|
+ PCMPEQ %xmm3, %xmm4 |
|
+ pmovmskb %xmm4, %ecx |
|
+ testl %ecx, %ecx |
|
+ jnz L(first_vec_x2) |
|
+ |
|
+ addq $VEC_SIZE, %rdi |
|
+ /* Save pointer again before realigning. */ |
|
+ movq %rdi, %rsi |
|
+ andq $-(VEC_SIZE * 2), %rdi |
|
+ .p2align 4 |
|
+L(first_loop): |
|
+ /* Do 2x VEC at a time. */ |
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4 |
|
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5 |
|
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for |
|
+ detecting zero. Note if this is found to be a bottleneck it |
|
+ may be worth adding an SSE4.1 wcsrchr implementation. */ |
|
+#ifdef USE_AS_WCSRCHR |
|
+ movaps %xmm5, %xmm6 |
|
+ pxor %xmm8, %xmm8 |
|
+ |
|
+ PCMPEQ %xmm8, %xmm5 |
|
+ PCMPEQ %xmm4, %xmm8 |
|
+ por %xmm5, %xmm8 |
|
+#else |
|
+ movaps %xmm5, %xmm6 |
|
+ PMINU %xmm4, %xmm5 |
|
+#endif |
|
+ |
|
+ movaps %xmm4, %xmm9 |
|
+ PCMPEQ %xmm0, %xmm4 |
|
+ PCMPEQ %xmm0, %xmm6 |
|
+ movaps %xmm6, %xmm7 |
|
+ por %xmm4, %xmm6 |
|
+#ifndef USE_AS_WCSRCHR |
|
+ pxor %xmm8, %xmm8 |
|
+ PCMPEQ %xmm5, %xmm8 |
|
+#endif |
|
+ pmovmskb %xmm8, %ecx |
|
+ pmovmskb %xmm6, %eax |
|
+ |
|
+ addq $(VEC_SIZE * 2), %rdi |
|
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can |
|
+ macro-fuse with `jz`. */ |
|
+ addl %ecx, %eax |
|
+ jz L(first_loop) |
|
+ |
|
+ /* Check if there is zero match. */ |
|
+ testl %ecx, %ecx |
|
+ jz L(second_loop_match) |
|
+ |
|
+ /* Check if there was a match in last iteration. */ |
|
+ subl %ecx, %eax |
|
+ jnz L(new_match) |
|
+ |
|
+L(first_loop_old_match): |
|
+ PCMPEQ %xmm0, %xmm2 |
|
+ PCMPEQ %xmm0, %xmm3 |
|
+ pmovmskb %xmm2, %ecx |
|
+ pmovmskb %xmm3, %eax |
|
+ addl %eax, %ecx |
|
+ jz L(first_vec_x0_test) |
|
+ /* NB: We could move this shift to before the branch and save a |
|
+ bit of code size / performance on the fall through. The |
|
+ branch leads to the null case which generally seems hotter |
|
+ than char in first 3x VEC. */ |
|
+ sall $16, %eax |
|
+ orl %ecx, %eax |
|
+ |
|
+ bsrl %eax, %eax |
|
+ addq %rsi, %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
+ ret |
|
+ |
|
+ .p2align 4 |
|
+L(new_match): |
|
+ pxor %xmm6, %xmm6 |
|
+ PCMPEQ %xmm9, %xmm6 |
|
+ pmovmskb %xmm6, %eax |
|
+ sall $16, %ecx |
|
+ orl %eax, %ecx |
|
+ |
|
+ /* We can't reuse either of the old comparisons as since we mask |
|
+ of zeros after first zero (instead of using the full |
|
+ comparison) we can't gurantee no interference between match |
|
+ after end of string and valid match. */ |
|
+ pmovmskb %xmm4, %eax |
|
+ pmovmskb %xmm7, %edx |
|
+ sall $16, %edx |
|
+ orl %edx, %eax |
|
+ |
|
+ leal -1(%ecx), %edx |
|
+ xorl %edx, %ecx |
|
+ andl %ecx, %eax |
|
+ jz L(first_loop_old_match) |
|
+ bsrl %eax, %eax |
|
+ addq %rdi, %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
ret |
|
|
|
+ /* Save minimum state for getting most recent match. We can |
|
+ throw out all previous work. */ |
|
.p2align 4 |
|
-L(no_c_found): |
|
- movl $1, %esi |
|
- xorl %ecx, %ecx |
|
- jmp L(loop_header) |
|
+L(second_loop_match): |
|
+ movq %rdi, %rsi |
|
+ movaps %xmm4, %xmm2 |
|
+ movaps %xmm7, %xmm3 |
|
|
|
.p2align 4 |
|
-L(exit): |
|
- xorl %eax, %eax |
|
+L(second_loop): |
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4 |
|
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5 |
|
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for |
|
+ detecting zero. Note if this is found to be a bottleneck it |
|
+ may be worth adding an SSE4.1 wcsrchr implementation. */ |
|
+#ifdef USE_AS_WCSRCHR |
|
+ movaps %xmm5, %xmm6 |
|
+ pxor %xmm8, %xmm8 |
|
+ |
|
+ PCMPEQ %xmm8, %xmm5 |
|
+ PCMPEQ %xmm4, %xmm8 |
|
+ por %xmm5, %xmm8 |
|
+#else |
|
+ movaps %xmm5, %xmm6 |
|
+ PMINU %xmm4, %xmm5 |
|
+#endif |
|
+ |
|
+ movaps %xmm4, %xmm9 |
|
+ PCMPEQ %xmm0, %xmm4 |
|
+ PCMPEQ %xmm0, %xmm6 |
|
+ movaps %xmm6, %xmm7 |
|
+ por %xmm4, %xmm6 |
|
+#ifndef USE_AS_WCSRCHR |
|
+ pxor %xmm8, %xmm8 |
|
+ PCMPEQ %xmm5, %xmm8 |
|
+#endif |
|
+ |
|
+ pmovmskb %xmm8, %ecx |
|
+ pmovmskb %xmm6, %eax |
|
+ |
|
+ addq $(VEC_SIZE * 2), %rdi |
|
+ /* Either null term or new occurence of CHAR. */ |
|
+ addl %ecx, %eax |
|
+ jz L(second_loop) |
|
+ |
|
+ /* No null term so much be new occurence of CHAR. */ |
|
+ testl %ecx, %ecx |
|
+ jz L(second_loop_match) |
|
+ |
|
+ |
|
+ subl %ecx, %eax |
|
+ jnz L(second_loop_new_match) |
|
+ |
|
+L(second_loop_old_match): |
|
+ pmovmskb %xmm2, %ecx |
|
+ pmovmskb %xmm3, %eax |
|
+ sall $16, %eax |
|
+ orl %ecx, %eax |
|
+ bsrl %eax, %eax |
|
+ addq %rsi, %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
ret |
|
|
|
.p2align 4 |
|
+L(second_loop_new_match): |
|
+ pxor %xmm6, %xmm6 |
|
+ PCMPEQ %xmm9, %xmm6 |
|
+ pmovmskb %xmm6, %eax |
|
+ sall $16, %ecx |
|
+ orl %eax, %ecx |
|
+ |
|
+ /* We can't reuse either of the old comparisons as since we mask |
|
+ of zeros after first zero (instead of using the full |
|
+ comparison) we can't gurantee no interference between match |
|
+ after end of string and valid match. */ |
|
+ pmovmskb %xmm4, %eax |
|
+ pmovmskb %xmm7, %edx |
|
+ sall $16, %edx |
|
+ orl %edx, %eax |
|
+ |
|
+ leal -1(%ecx), %edx |
|
+ xorl %edx, %ecx |
|
+ andl %ecx, %eax |
|
+ jz L(second_loop_old_match) |
|
+ bsrl %eax, %eax |
|
+ addq %rdi, %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
+ ret |
|
+ |
|
+ .p2align 4,, 4 |
|
L(cross_page): |
|
- movq %rdi, %rax |
|
- pxor %xmm0, %xmm0 |
|
- andq $-64, %rax |
|
- movdqu (%rax), %xmm5 |
|
- movdqa %xmm5, %xmm6 |
|
- movdqu 16(%rax), %xmm4 |
|
- pcmpeqb %xmm1, %xmm5 |
|
- pcmpeqb %xmm0, %xmm6 |
|
- movdqu 32(%rax), %xmm3 |
|
- pmovmskb %xmm6, %esi |
|
- movdqa %xmm4, %xmm6 |
|
- movdqu 48(%rax), %xmm2 |
|
- pcmpeqb %xmm1, %xmm4 |
|
- pcmpeqb %xmm0, %xmm6 |
|
- pmovmskb %xmm6, %edx |
|
- movdqa %xmm3, %xmm6 |
|
- pcmpeqb %xmm1, %xmm3 |
|
- pcmpeqb %xmm0, %xmm6 |
|
- pcmpeqb %xmm2, %xmm0 |
|
- salq $16, %rdx |
|
- pmovmskb %xmm3, %r9d |
|
- pmovmskb %xmm6, %r8d |
|
- pmovmskb %xmm0, %ecx |
|
- salq $32, %r9 |
|
- salq $32, %r8 |
|
- pcmpeqb %xmm1, %xmm2 |
|
- orq %r8, %rdx |
|
- salq $48, %rcx |
|
- pmovmskb %xmm5, %r8d |
|
- orq %rsi, %rdx |
|
- pmovmskb %xmm4, %esi |
|
- orq %rcx, %rdx |
|
- pmovmskb %xmm2, %ecx |
|
- salq $16, %rsi |
|
- salq $48, %rcx |
|
- orq %r9, %rsi |
|
- orq %r8, %rsi |
|
- orq %rcx, %rsi |
|
+ movq %rdi, %rsi |
|
+ andq $-VEC_SIZE, %rsi |
|
+ movaps (%rsi), %xmm1 |
|
+ pxor %xmm2, %xmm2 |
|
+ PCMPEQ %xmm1, %xmm2 |
|
+ pmovmskb %xmm2, %edx |
|
movl %edi, %ecx |
|
- subl %eax, %ecx |
|
- shrq %cl, %rdx |
|
- shrq %cl, %rsi |
|
- testq %rdx, %rdx |
|
- je L(loop_header2) |
|
- leaq -1(%rdx), %rax |
|
- xorq %rdx, %rax |
|
- andq %rax, %rsi |
|
- je L(exit) |
|
- bsrq %rsi, %rax |
|
+ andl $(VEC_SIZE - 1), %ecx |
|
+ sarl %cl, %edx |
|
+ jz L(cross_page_continue) |
|
+ PCMPEQ %xmm0, %xmm1 |
|
+ pmovmskb %xmm1, %eax |
|
+ sarl %cl, %eax |
|
+ leal -1(%rdx), %ecx |
|
+ xorl %edx, %ecx |
|
+ andl %ecx, %eax |
|
+ jz L(ret1) |
|
+ bsrl %eax, %eax |
|
addq %rdi, %rax |
|
+#ifdef USE_AS_WCSRCHR |
|
+ andq $-CHAR_SIZE, %rax |
|
+#endif |
|
+L(ret1): |
|
ret |
|
-END (strrchr) |
|
+END(STRRCHR) |
|
|
|
-weak_alias (strrchr, rindex) |
|
-libc_hidden_builtin_def (strrchr) |
|
+#ifndef USE_AS_WCSRCHR |
|
+ weak_alias (STRRCHR, rindex) |
|
+ libc_hidden_builtin_def (STRRCHR) |
|
+#endif |
|
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S |
|
index 6b318d3f29de9a9e..9006f2220963d76c 100644 |
|
--- a/sysdeps/x86_64/wcsrchr.S |
|
+++ b/sysdeps/x86_64/wcsrchr.S |
|
@@ -17,266 +17,12 @@ |
|
License along with the GNU C Library; if not, see |
|
<https://www.gnu.org/licenses/>. */ |
|
|
|
-#include <sysdep.h> |
|
|
|
- .text |
|
-ENTRY (wcsrchr) |
|
+#define USE_AS_WCSRCHR 1 |
|
+#define NO_PMINU 1 |
|
|
|
- movd %rsi, %xmm1 |
|
- mov %rdi, %rcx |
|
- punpckldq %xmm1, %xmm1 |
|
- pxor %xmm2, %xmm2 |
|
- punpckldq %xmm1, %xmm1 |
|
- and $63, %rcx |
|
- cmp $48, %rcx |
|
- ja L(crosscache) |
|
+#ifndef STRRCHR |
|
+# define STRRCHR wcsrchr |
|
+#endif |
|
|
|
- movdqu (%rdi), %xmm0 |
|
- pcmpeqd %xmm0, %xmm2 |
|
- pcmpeqd %xmm1, %xmm0 |
|
- pmovmskb %xmm2, %rcx |
|
- pmovmskb %xmm0, %rax |
|
- add $16, %rdi |
|
- |
|
- test %rax, %rax |
|
- jnz L(unaligned_match1) |
|
- |
|
- test %rcx, %rcx |
|
- jnz L(return_null) |
|
- |
|
- and $-16, %rdi |
|
- xor %r8, %r8 |
|
- jmp L(loop) |
|
- |
|
- .p2align 4 |
|
-L(unaligned_match1): |
|
- test %rcx, %rcx |
|
- jnz L(prolog_find_zero_1) |
|
- |
|
- mov %rax, %r8 |
|
- mov %rdi, %rsi |
|
- and $-16, %rdi |
|
- jmp L(loop) |
|
- |
|
- .p2align 4 |
|
-L(crosscache): |
|
- and $15, %rcx |
|
- and $-16, %rdi |
|
- pxor %xmm3, %xmm3 |
|
- movdqa (%rdi), %xmm0 |
|
- pcmpeqd %xmm0, %xmm3 |
|
- pcmpeqd %xmm1, %xmm0 |
|
- pmovmskb %xmm3, %rdx |
|
- pmovmskb %xmm0, %rax |
|
- shr %cl, %rdx |
|
- shr %cl, %rax |
|
- add $16, %rdi |
|
- |
|
- test %rax, %rax |
|
- jnz L(unaligned_match) |
|
- |
|
- test %rdx, %rdx |
|
- jnz L(return_null) |
|
- |
|
- xor %r8, %r8 |
|
- jmp L(loop) |
|
- |
|
- .p2align 4 |
|
-L(unaligned_match): |
|
- test %rdx, %rdx |
|
- jnz L(prolog_find_zero) |
|
- |
|
- mov %rax, %r8 |
|
- lea (%rdi, %rcx), %rsi |
|
- |
|
-/* Loop start on aligned string. */ |
|
- .p2align 4 |
|
-L(loop): |
|
- movdqa (%rdi), %xmm0 |
|
- pcmpeqd %xmm0, %xmm2 |
|
- add $16, %rdi |
|
- pcmpeqd %xmm1, %xmm0 |
|
- pmovmskb %xmm2, %rcx |
|
- pmovmskb %xmm0, %rax |
|
- or %rax, %rcx |
|
- jnz L(matches) |
|
- |
|
- movdqa (%rdi), %xmm3 |
|
- pcmpeqd %xmm3, %xmm2 |
|
- add $16, %rdi |
|
- pcmpeqd %xmm1, %xmm3 |
|
- pmovmskb %xmm2, %rcx |
|
- pmovmskb %xmm3, %rax |
|
- or %rax, %rcx |
|
- jnz L(matches) |
|
- |
|
- movdqa (%rdi), %xmm4 |
|
- pcmpeqd %xmm4, %xmm2 |
|
- add $16, %rdi |
|
- pcmpeqd %xmm1, %xmm4 |
|
- pmovmskb %xmm2, %rcx |
|
- pmovmskb %xmm4, %rax |
|
- or %rax, %rcx |
|
- jnz L(matches) |
|
- |
|
- movdqa (%rdi), %xmm5 |
|
- pcmpeqd %xmm5, %xmm2 |
|
- add $16, %rdi |
|
- pcmpeqd %xmm1, %xmm5 |
|
- pmovmskb %xmm2, %rcx |
|
- pmovmskb %xmm5, %rax |
|
- or %rax, %rcx |
|
- jz L(loop) |
|
- |
|
- .p2align 4 |
|
-L(matches): |
|
- test %rax, %rax |
|
- jnz L(match) |
|
-L(return_value): |
|
- test %r8, %r8 |
|
- jz L(return_null) |
|
- mov %r8, %rax |
|
- mov %rsi, %rdi |
|
- |
|
- test $15 << 4, %ah |
|
- jnz L(match_fourth_wchar) |
|
- test %ah, %ah |
|
- jnz L(match_third_wchar) |
|
- test $15 << 4, %al |
|
- jnz L(match_second_wchar) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(match): |
|
- pmovmskb %xmm2, %rcx |
|
- test %rcx, %rcx |
|
- jnz L(find_zero) |
|
- mov %rax, %r8 |
|
- mov %rdi, %rsi |
|
- jmp L(loop) |
|
- |
|
- .p2align 4 |
|
-L(find_zero): |
|
- test $15, %cl |
|
- jnz L(find_zero_in_first_wchar) |
|
- test %cl, %cl |
|
- jnz L(find_zero_in_second_wchar) |
|
- test $15, %ch |
|
- jnz L(find_zero_in_third_wchar) |
|
- |
|
- and $1 << 13 - 1, %rax |
|
- jz L(return_value) |
|
- |
|
- test $15 << 4, %ah |
|
- jnz L(match_fourth_wchar) |
|
- test %ah, %ah |
|
- jnz L(match_third_wchar) |
|
- test $15 << 4, %al |
|
- jnz L(match_second_wchar) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(find_zero_in_first_wchar): |
|
- test $1, %rax |
|
- jz L(return_value) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(find_zero_in_second_wchar): |
|
- and $1 << 5 - 1, %rax |
|
- jz L(return_value) |
|
- |
|
- test $15 << 4, %al |
|
- jnz L(match_second_wchar) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(find_zero_in_third_wchar): |
|
- and $1 << 9 - 1, %rax |
|
- jz L(return_value) |
|
- |
|
- test %ah, %ah |
|
- jnz L(match_third_wchar) |
|
- test $15 << 4, %al |
|
- jnz L(match_second_wchar) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(prolog_find_zero): |
|
- add %rcx, %rdi |
|
- mov %rdx, %rcx |
|
-L(prolog_find_zero_1): |
|
- test $15, %cl |
|
- jnz L(prolog_find_zero_in_first_wchar) |
|
- test %cl, %cl |
|
- jnz L(prolog_find_zero_in_second_wchar) |
|
- test $15, %ch |
|
- jnz L(prolog_find_zero_in_third_wchar) |
|
- |
|
- and $1 << 13 - 1, %rax |
|
- jz L(return_null) |
|
- |
|
- test $15 << 4, %ah |
|
- jnz L(match_fourth_wchar) |
|
- test %ah, %ah |
|
- jnz L(match_third_wchar) |
|
- test $15 << 4, %al |
|
- jnz L(match_second_wchar) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(prolog_find_zero_in_first_wchar): |
|
- test $1, %rax |
|
- jz L(return_null) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(prolog_find_zero_in_second_wchar): |
|
- and $1 << 5 - 1, %rax |
|
- jz L(return_null) |
|
- |
|
- test $15 << 4, %al |
|
- jnz L(match_second_wchar) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(prolog_find_zero_in_third_wchar): |
|
- and $1 << 9 - 1, %rax |
|
- jz L(return_null) |
|
- |
|
- test %ah, %ah |
|
- jnz L(match_third_wchar) |
|
- test $15 << 4, %al |
|
- jnz L(match_second_wchar) |
|
- lea -16(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(match_second_wchar): |
|
- lea -12(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(match_third_wchar): |
|
- lea -8(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(match_fourth_wchar): |
|
- lea -4(%rdi), %rax |
|
- ret |
|
- |
|
- .p2align 4 |
|
-L(return_null): |
|
- xor %rax, %rax |
|
- ret |
|
- |
|
-END (wcsrchr) |
|
+#include "../strrchr.S"
|
|
|