You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
259 lines
8.3 KiB
259 lines
8.3 KiB
commit df5de87260dba479873b2850bbe5c0b81c2376f6 |
|
Author: Noah Goldstein <goldstein.w.n@gmail.com> |
|
Date: Fri Apr 15 12:28:01 2022 -0500 |
|
|
|
x86: Cleanup page cross code in memcmp-avx2-movbe.S |
|
|
|
Old code was both inefficient and wasted code size. New code (-62 |
|
bytes) and comparable or better performance in the page cross case. |
|
|
|
geometric_mean(N=20) of page cross cases New / Original: 0.960 |
|
|
|
size, align0, align1, ret, New Time/Old Time |
|
1, 4095, 0, 0, 1.001 |
|
1, 4095, 0, 1, 0.999 |
|
1, 4095, 0, -1, 1.0 |
|
2, 4094, 0, 0, 1.0 |
|
2, 4094, 0, 1, 1.0 |
|
2, 4094, 0, -1, 1.0 |
|
3, 4093, 0, 0, 1.0 |
|
3, 4093, 0, 1, 1.0 |
|
3, 4093, 0, -1, 1.0 |
|
4, 4092, 0, 0, 0.987 |
|
4, 4092, 0, 1, 1.0 |
|
4, 4092, 0, -1, 1.0 |
|
5, 4091, 0, 0, 0.984 |
|
5, 4091, 0, 1, 1.002 |
|
5, 4091, 0, -1, 1.005 |
|
6, 4090, 0, 0, 0.993 |
|
6, 4090, 0, 1, 1.001 |
|
6, 4090, 0, -1, 1.003 |
|
7, 4089, 0, 0, 0.991 |
|
7, 4089, 0, 1, 1.0 |
|
7, 4089, 0, -1, 1.001 |
|
8, 4088, 0, 0, 0.875 |
|
8, 4088, 0, 1, 0.881 |
|
8, 4088, 0, -1, 0.888 |
|
9, 4087, 0, 0, 0.872 |
|
9, 4087, 0, 1, 0.879 |
|
9, 4087, 0, -1, 0.883 |
|
10, 4086, 0, 0, 0.878 |
|
10, 4086, 0, 1, 0.886 |
|
10, 4086, 0, -1, 0.873 |
|
11, 4085, 0, 0, 0.878 |
|
11, 4085, 0, 1, 0.881 |
|
11, 4085, 0, -1, 0.879 |
|
12, 4084, 0, 0, 0.873 |
|
12, 4084, 0, 1, 0.889 |
|
12, 4084, 0, -1, 0.875 |
|
13, 4083, 0, 0, 0.873 |
|
13, 4083, 0, 1, 0.863 |
|
13, 4083, 0, -1, 0.863 |
|
14, 4082, 0, 0, 0.838 |
|
14, 4082, 0, 1, 0.869 |
|
14, 4082, 0, -1, 0.877 |
|
15, 4081, 0, 0, 0.841 |
|
15, 4081, 0, 1, 0.869 |
|
15, 4081, 0, -1, 0.876 |
|
16, 4080, 0, 0, 0.988 |
|
16, 4080, 0, 1, 0.99 |
|
16, 4080, 0, -1, 0.989 |
|
17, 4079, 0, 0, 0.978 |
|
17, 4079, 0, 1, 0.981 |
|
17, 4079, 0, -1, 0.98 |
|
18, 4078, 0, 0, 0.981 |
|
18, 4078, 0, 1, 0.98 |
|
18, 4078, 0, -1, 0.985 |
|
19, 4077, 0, 0, 0.977 |
|
19, 4077, 0, 1, 0.979 |
|
19, 4077, 0, -1, 0.986 |
|
20, 4076, 0, 0, 0.977 |
|
20, 4076, 0, 1, 0.986 |
|
20, 4076, 0, -1, 0.984 |
|
21, 4075, 0, 0, 0.977 |
|
21, 4075, 0, 1, 0.983 |
|
21, 4075, 0, -1, 0.988 |
|
22, 4074, 0, 0, 0.983 |
|
22, 4074, 0, 1, 0.994 |
|
22, 4074, 0, -1, 0.993 |
|
23, 4073, 0, 0, 0.98 |
|
23, 4073, 0, 1, 0.992 |
|
23, 4073, 0, -1, 0.995 |
|
24, 4072, 0, 0, 0.989 |
|
24, 4072, 0, 1, 0.989 |
|
24, 4072, 0, -1, 0.991 |
|
25, 4071, 0, 0, 0.99 |
|
25, 4071, 0, 1, 0.999 |
|
25, 4071, 0, -1, 0.996 |
|
26, 4070, 0, 0, 0.993 |
|
26, 4070, 0, 1, 0.995 |
|
26, 4070, 0, -1, 0.998 |
|
27, 4069, 0, 0, 0.993 |
|
27, 4069, 0, 1, 0.999 |
|
27, 4069, 0, -1, 1.0 |
|
28, 4068, 0, 0, 0.997 |
|
28, 4068, 0, 1, 1.0 |
|
28, 4068, 0, -1, 0.999 |
|
29, 4067, 0, 0, 0.996 |
|
29, 4067, 0, 1, 0.999 |
|
29, 4067, 0, -1, 0.999 |
|
30, 4066, 0, 0, 0.991 |
|
30, 4066, 0, 1, 1.001 |
|
30, 4066, 0, -1, 0.999 |
|
31, 4065, 0, 0, 0.988 |
|
31, 4065, 0, 1, 0.998 |
|
31, 4065, 0, -1, 0.998 |
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
|
|
|
(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f) |
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |
|
index 2621ec907aedb781..ec9cf0852edf216d 100644 |
|
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |
|
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |
|
@@ -429,22 +429,21 @@ L(page_cross_less_vec): |
|
# ifndef USE_AS_WMEMCMP |
|
cmpl $8, %edx |
|
jae L(between_8_15) |
|
+ /* Fall through for [4, 7]. */ |
|
cmpl $4, %edx |
|
- jae L(between_4_7) |
|
+ jb L(between_2_3) |
|
|
|
- /* Load as big endian to avoid branches. */ |
|
- movzwl (%rdi), %eax |
|
- movzwl (%rsi), %ecx |
|
- shll $8, %eax |
|
- shll $8, %ecx |
|
- bswap %eax |
|
- bswap %ecx |
|
- movzbl -1(%rdi, %rdx), %edi |
|
- movzbl -1(%rsi, %rdx), %esi |
|
- orl %edi, %eax |
|
- orl %esi, %ecx |
|
- /* Subtraction is okay because the upper 8 bits are zero. */ |
|
- subl %ecx, %eax |
|
+ movbe (%rdi), %eax |
|
+ movbe (%rsi), %ecx |
|
+ shlq $32, %rax |
|
+ shlq $32, %rcx |
|
+ movbe -4(%rdi, %rdx), %edi |
|
+ movbe -4(%rsi, %rdx), %esi |
|
+ orq %rdi, %rax |
|
+ orq %rsi, %rcx |
|
+ subq %rcx, %rax |
|
+ /* Fast path for return zero. */ |
|
+ jnz L(ret_nonzero) |
|
/* No ymm register was touched. */ |
|
ret |
|
|
|
@@ -457,9 +456,33 @@ L(one_or_less): |
|
/* No ymm register was touched. */ |
|
ret |
|
|
|
+ .p2align 4,, 5 |
|
+L(ret_nonzero): |
|
+ sbbl %eax, %eax |
|
+ orl $1, %eax |
|
+ /* No ymm register was touched. */ |
|
+ ret |
|
+ |
|
+ .p2align 4,, 2 |
|
+L(zero): |
|
+ xorl %eax, %eax |
|
+ /* No ymm register was touched. */ |
|
+ ret |
|
+ |
|
.p2align 4 |
|
L(between_8_15): |
|
-# endif |
|
+ movbe (%rdi), %rax |
|
+ movbe (%rsi), %rcx |
|
+ subq %rcx, %rax |
|
+ jnz L(ret_nonzero) |
|
+ movbe -8(%rdi, %rdx), %rax |
|
+ movbe -8(%rsi, %rdx), %rcx |
|
+ subq %rcx, %rax |
|
+ /* Fast path for return zero. */ |
|
+ jnz L(ret_nonzero) |
|
+ /* No ymm register was touched. */ |
|
+ ret |
|
+# else |
|
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ |
|
vmovq (%rdi), %xmm1 |
|
vmovq (%rsi), %xmm2 |
|
@@ -475,16 +498,13 @@ L(between_8_15): |
|
VPCMPEQ %xmm1, %xmm2, %xmm2 |
|
vpmovmskb %xmm2, %eax |
|
subl $0xffff, %eax |
|
+ /* Fast path for return zero. */ |
|
jnz L(return_vec_0) |
|
/* No ymm register was touched. */ |
|
ret |
|
+# endif |
|
|
|
- .p2align 4 |
|
-L(zero): |
|
- xorl %eax, %eax |
|
- ret |
|
- |
|
- .p2align 4 |
|
+ .p2align 4,, 10 |
|
L(between_16_31): |
|
/* From 16 to 31 bytes. No branch when size == 16. */ |
|
vmovdqu (%rsi), %xmm2 |
|
@@ -501,11 +521,17 @@ L(between_16_31): |
|
VPCMPEQ (%rdi), %xmm2, %xmm2 |
|
vpmovmskb %xmm2, %eax |
|
subl $0xffff, %eax |
|
+ /* Fast path for return zero. */ |
|
jnz L(return_vec_0) |
|
/* No ymm register was touched. */ |
|
ret |
|
|
|
# ifdef USE_AS_WMEMCMP |
|
+ .p2align 4,, 2 |
|
+L(zero): |
|
+ xorl %eax, %eax |
|
+ ret |
|
+ |
|
.p2align 4 |
|
L(one_or_less): |
|
jb L(zero) |
|
@@ -520,22 +546,20 @@ L(one_or_less): |
|
# else |
|
|
|
.p2align 4 |
|
-L(between_4_7): |
|
- /* Load as big endian with overlapping movbe to avoid branches. |
|
- */ |
|
- movbe (%rdi), %eax |
|
- movbe (%rsi), %ecx |
|
- shlq $32, %rax |
|
- shlq $32, %rcx |
|
- movbe -4(%rdi, %rdx), %edi |
|
- movbe -4(%rsi, %rdx), %esi |
|
- orq %rdi, %rax |
|
- orq %rsi, %rcx |
|
- subq %rcx, %rax |
|
- jz L(zero_4_7) |
|
- sbbl %eax, %eax |
|
- orl $1, %eax |
|
-L(zero_4_7): |
|
+L(between_2_3): |
|
+ /* Load as big endian to avoid branches. */ |
|
+ movzwl (%rdi), %eax |
|
+ movzwl (%rsi), %ecx |
|
+ bswap %eax |
|
+ bswap %ecx |
|
+ shrl %eax |
|
+ shrl %ecx |
|
+ movzbl -1(%rdi, %rdx), %edi |
|
+ movzbl -1(%rsi, %rdx), %esi |
|
+ orl %edi, %eax |
|
+ orl %esi, %ecx |
|
+ /* Subtraction is okay because the upper bit is zero. */ |
|
+ subl %ecx, %eax |
|
/* No ymm register was touched. */ |
|
ret |
|
# endif
|
|
|