You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
367 lines
9.3 KiB
367 lines
9.3 KiB
commit 3c55c207564c0ae30d78d01689b4ae16bf38dd63 |
|
Author: Noah Goldstein <goldstein.w.n@gmail.com> |
|
Date: Wed Mar 23 16:57:16 2022 -0500 |
|
|
|
x86: Code cleanup in strchr-avx2 and comment justifying branch |
|
|
|
Small code cleanup for size: -53 bytes. |
|
|
|
Add comment justifying using a branch to do NULL/non-null return. |
|
|
|
All string/memory tests pass and no regressions in benchtests. |
|
|
|
geometric_mean(N=20) of all benchmarks Original / New: 1.00 |
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
|
|
|
(cherry picked from commit a6fbf4d51e9ba8063c4f8331564892ead9c67344) |
|
|
|
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S |
|
index 413942b96a835c4a..ef4ce0f3677e30c8 100644 |
|
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S |
|
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S |
|
@@ -48,13 +48,13 @@ |
|
# define PAGE_SIZE 4096 |
|
|
|
.section SECTION(.text),"ax",@progbits |
|
-ENTRY (STRCHR) |
|
+ENTRY_P2ALIGN (STRCHR, 5) |
|
/* Broadcast CHAR to YMM0. */ |
|
vmovd %esi, %xmm0 |
|
movl %edi, %eax |
|
andl $(PAGE_SIZE - 1), %eax |
|
VPBROADCAST %xmm0, %ymm0 |
|
- vpxor %xmm9, %xmm9, %xmm9 |
|
+ vpxor %xmm1, %xmm1, %xmm1 |
|
|
|
/* Check if we cross page boundary with one vector load. */ |
|
cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
|
@@ -62,37 +62,29 @@ ENTRY (STRCHR) |
|
|
|
/* Check the first VEC_SIZE bytes. Search for both CHAR and the |
|
null byte. */ |
|
- vmovdqu (%rdi), %ymm8 |
|
- VPCMPEQ %ymm8, %ymm0, %ymm1 |
|
- VPCMPEQ %ymm8, %ymm9, %ymm2 |
|
- vpor %ymm1, %ymm2, %ymm1 |
|
- vpmovmskb %ymm1, %eax |
|
+ vmovdqu (%rdi), %ymm2 |
|
+ VPCMPEQ %ymm2, %ymm0, %ymm3 |
|
+ VPCMPEQ %ymm2, %ymm1, %ymm2 |
|
+ vpor %ymm3, %ymm2, %ymm3 |
|
+ vpmovmskb %ymm3, %eax |
|
testl %eax, %eax |
|
jz L(aligned_more) |
|
tzcntl %eax, %eax |
|
# ifndef USE_AS_STRCHRNUL |
|
- /* Found CHAR or the null byte. */ |
|
- cmp (%rdi, %rax), %CHAR_REG |
|
- jne L(zero) |
|
-# endif |
|
- addq %rdi, %rax |
|
- VZEROUPPER_RETURN |
|
- |
|
- /* .p2align 5 helps keep performance more consistent if ENTRY() |
|
- alignment % 32 was either 16 or 0. As well this makes the |
|
- alignment % 32 of the loop_4x_vec fixed which makes tuning it |
|
- easier. */ |
|
- .p2align 5 |
|
-L(first_vec_x4): |
|
- tzcntl %eax, %eax |
|
- addq $(VEC_SIZE * 3 + 1), %rdi |
|
-# ifndef USE_AS_STRCHRNUL |
|
- /* Found CHAR or the null byte. */ |
|
+ /* Found CHAR or the null byte. */ |
|
cmp (%rdi, %rax), %CHAR_REG |
|
+ /* NB: Use a branch instead of cmovcc here. The expectation is |
|
+ that with strchr the user will branch based on input being |
|
+ null. Since this branch will be 100% predictive of the user |
|
+ branch, a branch miss here should save what otherwise would |
|
+ be a branch miss in the user code. Otherwise using a branch 1) |
|
+ saves code size and 2) is faster in highly predictable |
|
+ environments. */ |
|
jne L(zero) |
|
# endif |
|
addq %rdi, %rax |
|
- VZEROUPPER_RETURN |
|
+L(return_vzeroupper): |
|
+ ZERO_UPPER_VEC_REGISTERS_RETURN |
|
|
|
# ifndef USE_AS_STRCHRNUL |
|
L(zero): |
|
@@ -103,7 +95,8 @@ L(zero): |
|
|
|
.p2align 4 |
|
L(first_vec_x1): |
|
- tzcntl %eax, %eax |
|
+ /* Use bsf to save code size. */ |
|
+ bsfl %eax, %eax |
|
incq %rdi |
|
# ifndef USE_AS_STRCHRNUL |
|
/* Found CHAR or the null byte. */ |
|
@@ -113,9 +106,10 @@ L(first_vec_x1): |
|
addq %rdi, %rax |
|
VZEROUPPER_RETURN |
|
|
|
- .p2align 4 |
|
+ .p2align 4,, 10 |
|
L(first_vec_x2): |
|
- tzcntl %eax, %eax |
|
+ /* Use bsf to save code size. */ |
|
+ bsfl %eax, %eax |
|
addq $(VEC_SIZE + 1), %rdi |
|
# ifndef USE_AS_STRCHRNUL |
|
/* Found CHAR or the null byte. */ |
|
@@ -125,9 +119,10 @@ L(first_vec_x2): |
|
addq %rdi, %rax |
|
VZEROUPPER_RETURN |
|
|
|
- .p2align 4 |
|
+ .p2align 4,, 8 |
|
L(first_vec_x3): |
|
- tzcntl %eax, %eax |
|
+ /* Use bsf to save code size. */ |
|
+ bsfl %eax, %eax |
|
addq $(VEC_SIZE * 2 + 1), %rdi |
|
# ifndef USE_AS_STRCHRNUL |
|
/* Found CHAR or the null byte. */ |
|
@@ -137,6 +132,21 @@ L(first_vec_x3): |
|
addq %rdi, %rax |
|
VZEROUPPER_RETURN |
|
|
|
+ .p2align 4,, 10 |
|
+L(first_vec_x4): |
|
+ /* Use bsf to save code size. */ |
|
+ bsfl %eax, %eax |
|
+ addq $(VEC_SIZE * 3 + 1), %rdi |
|
+# ifndef USE_AS_STRCHRNUL |
|
+ /* Found CHAR or the null byte. */ |
|
+ cmp (%rdi, %rax), %CHAR_REG |
|
+ jne L(zero) |
|
+# endif |
|
+ addq %rdi, %rax |
|
+ VZEROUPPER_RETURN |
|
+ |
|
+ |
|
+ |
|
.p2align 4 |
|
L(aligned_more): |
|
/* Align data to VEC_SIZE - 1. This is the same number of |
|
@@ -146,90 +156,92 @@ L(aligned_more): |
|
L(cross_page_continue): |
|
/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time |
|
since data is only aligned to VEC_SIZE. */ |
|
- vmovdqa 1(%rdi), %ymm8 |
|
- VPCMPEQ %ymm8, %ymm0, %ymm1 |
|
- VPCMPEQ %ymm8, %ymm9, %ymm2 |
|
- vpor %ymm1, %ymm2, %ymm1 |
|
- vpmovmskb %ymm1, %eax |
|
+ vmovdqa 1(%rdi), %ymm2 |
|
+ VPCMPEQ %ymm2, %ymm0, %ymm3 |
|
+ VPCMPEQ %ymm2, %ymm1, %ymm2 |
|
+ vpor %ymm3, %ymm2, %ymm3 |
|
+ vpmovmskb %ymm3, %eax |
|
testl %eax, %eax |
|
jnz L(first_vec_x1) |
|
|
|
- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 |
|
- VPCMPEQ %ymm8, %ymm0, %ymm1 |
|
- VPCMPEQ %ymm8, %ymm9, %ymm2 |
|
- vpor %ymm1, %ymm2, %ymm1 |
|
- vpmovmskb %ymm1, %eax |
|
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 |
|
+ VPCMPEQ %ymm2, %ymm0, %ymm3 |
|
+ VPCMPEQ %ymm2, %ymm1, %ymm2 |
|
+ vpor %ymm3, %ymm2, %ymm3 |
|
+ vpmovmskb %ymm3, %eax |
|
testl %eax, %eax |
|
jnz L(first_vec_x2) |
|
|
|
- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 |
|
- VPCMPEQ %ymm8, %ymm0, %ymm1 |
|
- VPCMPEQ %ymm8, %ymm9, %ymm2 |
|
- vpor %ymm1, %ymm2, %ymm1 |
|
- vpmovmskb %ymm1, %eax |
|
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 |
|
+ VPCMPEQ %ymm2, %ymm0, %ymm3 |
|
+ VPCMPEQ %ymm2, %ymm1, %ymm2 |
|
+ vpor %ymm3, %ymm2, %ymm3 |
|
+ vpmovmskb %ymm3, %eax |
|
testl %eax, %eax |
|
jnz L(first_vec_x3) |
|
|
|
- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 |
|
- VPCMPEQ %ymm8, %ymm0, %ymm1 |
|
- VPCMPEQ %ymm8, %ymm9, %ymm2 |
|
- vpor %ymm1, %ymm2, %ymm1 |
|
- vpmovmskb %ymm1, %eax |
|
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 |
|
+ VPCMPEQ %ymm2, %ymm0, %ymm3 |
|
+ VPCMPEQ %ymm2, %ymm1, %ymm2 |
|
+ vpor %ymm3, %ymm2, %ymm3 |
|
+ vpmovmskb %ymm3, %eax |
|
testl %eax, %eax |
|
jnz L(first_vec_x4) |
|
- /* Align data to VEC_SIZE * 4 - 1. */ |
|
- addq $(VEC_SIZE * 4 + 1), %rdi |
|
- andq $-(VEC_SIZE * 4), %rdi |
|
+ /* Align data to VEC_SIZE * 4 - 1. */ |
|
+ incq %rdi |
|
+ orq $(VEC_SIZE * 4 - 1), %rdi |
|
.p2align 4 |
|
L(loop_4x_vec): |
|
/* Compare 4 * VEC at a time forward. */ |
|
- vmovdqa (%rdi), %ymm5 |
|
- vmovdqa (VEC_SIZE)(%rdi), %ymm6 |
|
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 |
|
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 |
|
+ vmovdqa 1(%rdi), %ymm6 |
|
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 |
|
|
|
/* Leaves only CHARS matching esi as 0. */ |
|
- vpxor %ymm5, %ymm0, %ymm1 |
|
vpxor %ymm6, %ymm0, %ymm2 |
|
vpxor %ymm7, %ymm0, %ymm3 |
|
- vpxor %ymm8, %ymm0, %ymm4 |
|
|
|
- VPMINU %ymm1, %ymm5, %ymm1 |
|
VPMINU %ymm2, %ymm6, %ymm2 |
|
VPMINU %ymm3, %ymm7, %ymm3 |
|
- VPMINU %ymm4, %ymm8, %ymm4 |
|
|
|
- VPMINU %ymm1, %ymm2, %ymm5 |
|
- VPMINU %ymm3, %ymm4, %ymm6 |
|
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 |
|
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 |
|
+ |
|
+ vpxor %ymm6, %ymm0, %ymm4 |
|
+ vpxor %ymm7, %ymm0, %ymm5 |
|
+ |
|
+ VPMINU %ymm4, %ymm6, %ymm4 |
|
+ VPMINU %ymm5, %ymm7, %ymm5 |
|
|
|
- VPMINU %ymm5, %ymm6, %ymm6 |
|
+ VPMINU %ymm2, %ymm3, %ymm6 |
|
+ VPMINU %ymm4, %ymm5, %ymm7 |
|
|
|
- VPCMPEQ %ymm6, %ymm9, %ymm6 |
|
- vpmovmskb %ymm6, %ecx |
|
+ VPMINU %ymm6, %ymm7, %ymm7 |
|
+ |
|
+ VPCMPEQ %ymm7, %ymm1, %ymm7 |
|
+ vpmovmskb %ymm7, %ecx |
|
subq $-(VEC_SIZE * 4), %rdi |
|
testl %ecx, %ecx |
|
jz L(loop_4x_vec) |
|
|
|
- |
|
- VPCMPEQ %ymm1, %ymm9, %ymm1 |
|
- vpmovmskb %ymm1, %eax |
|
+ VPCMPEQ %ymm2, %ymm1, %ymm2 |
|
+ vpmovmskb %ymm2, %eax |
|
testl %eax, %eax |
|
jnz L(last_vec_x0) |
|
|
|
|
|
- VPCMPEQ %ymm5, %ymm9, %ymm2 |
|
- vpmovmskb %ymm2, %eax |
|
+ VPCMPEQ %ymm3, %ymm1, %ymm3 |
|
+ vpmovmskb %ymm3, %eax |
|
testl %eax, %eax |
|
jnz L(last_vec_x1) |
|
|
|
- VPCMPEQ %ymm3, %ymm9, %ymm3 |
|
- vpmovmskb %ymm3, %eax |
|
+ VPCMPEQ %ymm4, %ymm1, %ymm4 |
|
+ vpmovmskb %ymm4, %eax |
|
/* rcx has combined result from all 4 VEC. It will only be used |
|
if the first 3 other VEC all did not contain a match. */ |
|
salq $32, %rcx |
|
orq %rcx, %rax |
|
tzcntq %rax, %rax |
|
- subq $(VEC_SIZE * 2), %rdi |
|
+ subq $(VEC_SIZE * 2 - 1), %rdi |
|
# ifndef USE_AS_STRCHRNUL |
|
/* Found CHAR or the null byte. */ |
|
cmp (%rdi, %rax), %CHAR_REG |
|
@@ -239,10 +251,11 @@ L(loop_4x_vec): |
|
VZEROUPPER_RETURN |
|
|
|
|
|
- .p2align 4 |
|
+ .p2align 4,, 10 |
|
L(last_vec_x0): |
|
- tzcntl %eax, %eax |
|
- addq $-(VEC_SIZE * 4), %rdi |
|
+ /* Use bsf to save code size. */ |
|
+ bsfl %eax, %eax |
|
+ addq $-(VEC_SIZE * 4 - 1), %rdi |
|
# ifndef USE_AS_STRCHRNUL |
|
/* Found CHAR or the null byte. */ |
|
cmp (%rdi, %rax), %CHAR_REG |
|
@@ -251,16 +264,11 @@ L(last_vec_x0): |
|
addq %rdi, %rax |
|
VZEROUPPER_RETURN |
|
|
|
-# ifndef USE_AS_STRCHRNUL |
|
-L(zero_end): |
|
- xorl %eax, %eax |
|
- VZEROUPPER_RETURN |
|
-# endif |
|
|
|
- .p2align 4 |
|
+ .p2align 4,, 10 |
|
L(last_vec_x1): |
|
tzcntl %eax, %eax |
|
- subq $(VEC_SIZE * 3), %rdi |
|
+ subq $(VEC_SIZE * 3 - 1), %rdi |
|
# ifndef USE_AS_STRCHRNUL |
|
/* Found CHAR or the null byte. */ |
|
cmp (%rdi, %rax), %CHAR_REG |
|
@@ -269,18 +277,23 @@ L(last_vec_x1): |
|
addq %rdi, %rax |
|
VZEROUPPER_RETURN |
|
|
|
+# ifndef USE_AS_STRCHRNUL |
|
+L(zero_end): |
|
+ xorl %eax, %eax |
|
+ VZEROUPPER_RETURN |
|
+# endif |
|
|
|
/* Cold case for crossing page with first load. */ |
|
- .p2align 4 |
|
+ .p2align 4,, 8 |
|
L(cross_page_boundary): |
|
movq %rdi, %rdx |
|
/* Align rdi to VEC_SIZE - 1. */ |
|
orq $(VEC_SIZE - 1), %rdi |
|
- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 |
|
- VPCMPEQ %ymm8, %ymm0, %ymm1 |
|
- VPCMPEQ %ymm8, %ymm9, %ymm2 |
|
- vpor %ymm1, %ymm2, %ymm1 |
|
- vpmovmskb %ymm1, %eax |
|
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 |
|
+ VPCMPEQ %ymm2, %ymm0, %ymm3 |
|
+ VPCMPEQ %ymm2, %ymm1, %ymm2 |
|
+ vpor %ymm3, %ymm2, %ymm3 |
|
+ vpmovmskb %ymm3, %eax |
|
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT |
|
so no need to manually mod edx. */ |
|
sarxl %edx, %eax, %eax |
|
@@ -291,13 +304,10 @@ L(cross_page_boundary): |
|
xorl %ecx, %ecx |
|
/* Found CHAR or the null byte. */ |
|
cmp (%rdx, %rax), %CHAR_REG |
|
- leaq (%rdx, %rax), %rax |
|
- cmovne %rcx, %rax |
|
-# else |
|
- addq %rdx, %rax |
|
+ jne L(zero_end) |
|
# endif |
|
-L(return_vzeroupper): |
|
- ZERO_UPPER_VEC_REGISTERS_RETURN |
|
+ addq %rdx, %rax |
|
+ VZEROUPPER_RETURN |
|
|
|
END (STRCHR) |
|
-# endif |
|
+#endif
|
|
|