You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
253 lines
4.3 KiB
253 lines
4.3 KiB
commit 4ff6ae069b7caacd5f99088abd755717b994f660 |
|
Author: Noah Goldstein <goldstein.w.n@gmail.com> |
|
Date: Fri Mar 25 17:13:33 2022 -0500 |
|
|
|
x86: Small improvements for wcslen |
|
|
|
Just a few QOL changes. |
|
1. Prefer `add` over `lea` as it has more execution units it can run |
|
on. |
|
2. Don't break macro-fusion between `test` and `jcc` |
|
3. Reduce code size by removing gratuitous padding bytes (-90 |
|
bytes). |
|
|
|
geometric_mean(N=20) of all benchmarks New / Original: 0.959 |
|
|
|
All string/memory tests pass. |
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
|
|
|
(cherry picked from commit 244b415d386487521882debb845a040a4758cb18) |
|
|
|
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S |
|
index 61edea1d14d454c6..ad066863a44ea0a5 100644 |
|
--- a/sysdeps/x86_64/wcslen.S |
|
+++ b/sysdeps/x86_64/wcslen.S |
|
@@ -41,82 +41,82 @@ ENTRY (__wcslen) |
|
pxor %xmm0, %xmm0 |
|
|
|
lea 32(%rdi), %rax |
|
- lea 16(%rdi), %rcx |
|
+ addq $16, %rdi |
|
and $-16, %rax |
|
|
|
pcmpeqd (%rax), %xmm0 |
|
pmovmskb %xmm0, %edx |
|
pxor %xmm1, %xmm1 |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm1 |
|
pmovmskb %xmm1, %edx |
|
pxor %xmm2, %xmm2 |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm2 |
|
pmovmskb %xmm2, %edx |
|
pxor %xmm3, %xmm3 |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm3 |
|
pmovmskb %xmm3, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm0 |
|
pmovmskb %xmm0, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm1 |
|
pmovmskb %xmm1, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm2 |
|
pmovmskb %xmm2, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm3 |
|
pmovmskb %xmm3, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm0 |
|
pmovmskb %xmm0, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm1 |
|
pmovmskb %xmm1, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm2 |
|
pmovmskb %xmm2, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
pcmpeqd (%rax), %xmm3 |
|
pmovmskb %xmm3, %edx |
|
+ addq $16, %rax |
|
test %edx, %edx |
|
- lea 16(%rax), %rax |
|
jnz L(exit) |
|
|
|
and $-0x40, %rax |
|
@@ -133,104 +133,100 @@ L(aligned_64_loop): |
|
pminub %xmm0, %xmm2 |
|
pcmpeqd %xmm3, %xmm2 |
|
pmovmskb %xmm2, %edx |
|
+ addq $64, %rax |
|
test %edx, %edx |
|
- lea 64(%rax), %rax |
|
jz L(aligned_64_loop) |
|
|
|
pcmpeqd -64(%rax), %xmm3 |
|
pmovmskb %xmm3, %edx |
|
+ addq $48, %rdi |
|
test %edx, %edx |
|
- lea 48(%rcx), %rcx |
|
jnz L(exit) |
|
|
|
pcmpeqd %xmm1, %xmm3 |
|
pmovmskb %xmm3, %edx |
|
+ addq $-16, %rdi |
|
test %edx, %edx |
|
- lea -16(%rcx), %rcx |
|
jnz L(exit) |
|
|
|
pcmpeqd -32(%rax), %xmm3 |
|
pmovmskb %xmm3, %edx |
|
+ addq $-16, %rdi |
|
test %edx, %edx |
|
- lea -16(%rcx), %rcx |
|
jnz L(exit) |
|
|
|
pcmpeqd %xmm6, %xmm3 |
|
pmovmskb %xmm3, %edx |
|
+ addq $-16, %rdi |
|
test %edx, %edx |
|
- lea -16(%rcx), %rcx |
|
- jnz L(exit) |
|
- |
|
- jmp L(aligned_64_loop) |
|
+ jz L(aligned_64_loop) |
|
|
|
.p2align 4 |
|
L(exit): |
|
- sub %rcx, %rax |
|
+ sub %rdi, %rax |
|
shr $2, %rax |
|
test %dl, %dl |
|
jz L(exit_high) |
|
|
|
- mov %dl, %cl |
|
- and $15, %cl |
|
+ andl $15, %edx |
|
jz L(exit_1) |
|
ret |
|
|
|
- .p2align 4 |
|
+ /* No align here. Naturally aligned % 16 == 1. */ |
|
L(exit_high): |
|
- mov %dh, %ch |
|
- and $15, %ch |
|
+ andl $(15 << 8), %edx |
|
jz L(exit_3) |
|
add $2, %rax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_1): |
|
add $1, %rax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_3): |
|
add $3, %rax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail0): |
|
- xor %rax, %rax |
|
+ xorl %eax, %eax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail1): |
|
- mov $1, %rax |
|
+ movl $1, %eax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail2): |
|
- mov $2, %rax |
|
+ movl $2, %eax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail3): |
|
- mov $3, %rax |
|
+ movl $3, %eax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail4): |
|
- mov $4, %rax |
|
+ movl $4, %eax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail5): |
|
- mov $5, %rax |
|
+ movl $5, %eax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail6): |
|
- mov $6, %rax |
|
+ movl $6, %eax |
|
ret |
|
|
|
- .p2align 4 |
|
+ .p2align 3 |
|
L(exit_tail7): |
|
- mov $7, %rax |
|
+ movl $7, %eax |
|
ret |
|
|
|
END (__wcslen)
|
|
|