You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
123 lines
2.7 KiB
123 lines
2.7 KiB
The memmove-related fix is dropped from this patch because RHEL 7.5 |
|
does not have an optimized memmove for POWER7. |
|
|
|
commit 63da5cd4a097d089033d980c42254c3356fa723f |
|
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> |
|
Date: Wed Oct 25 13:13:53 2017 -0200 |
|
|
|
powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove |
|
|
|
POWER9 DD2.1 and earlier have an issue where some cache-inhibited |
|
vector loads trap to the kernel, causing a performance degradation. To |
|
handle this in memcpy and memmove, lvx/stvx are used for aligned |
|
addresses instead of lxvd2x/stxvd2x. |
|
|
|
Reference: https://patchwork.ozlabs.org/patch/814059/ |
|
|
|
* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace |
|
lxvd2x/stxvd2x with lvx/stvx. |
|
* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise. |
|
|
|
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com> |
|
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> |
|
|
|
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S |
|
index 1ccbc2e..a7cdf8b 100644 |
|
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S |
|
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S |
|
@@ -91,63 +91,63 @@ L(aligned_copy): |
|
srdi 12,cnt,7 |
|
cmpdi 12,0 |
|
beq L(aligned_tail) |
|
- lxvd2x 6,0,src |
|
- lxvd2x 7,src,6 |
|
+ lvx 6,0,src |
|
+ lvx 7,src,6 |
|
mtctr 12 |
|
b L(aligned_128loop) |
|
|
|
.align 4 |
|
L(aligned_128head): |
|
/* for the 2nd + iteration of this loop. */ |
|
- lxvd2x 6,0,src |
|
- lxvd2x 7,src,6 |
|
+ lvx 6,0,src |
|
+ lvx 7,src,6 |
|
L(aligned_128loop): |
|
- lxvd2x 8,src,7 |
|
- lxvd2x 9,src,8 |
|
- stxvd2x 6,0,dst |
|
+ lvx 8,src,7 |
|
+ lvx 9,src,8 |
|
+ stvx 6,0,dst |
|
addi src,src,64 |
|
- stxvd2x 7,dst,6 |
|
- stxvd2x 8,dst,7 |
|
- stxvd2x 9,dst,8 |
|
- lxvd2x 6,0,src |
|
- lxvd2x 7,src,6 |
|
+ stvx 7,dst,6 |
|
+ stvx 8,dst,7 |
|
+ stvx 9,dst,8 |
|
+ lvx 6,0,src |
|
+ lvx 7,src,6 |
|
addi dst,dst,64 |
|
- lxvd2x 8,src,7 |
|
- lxvd2x 9,src,8 |
|
+ lvx 8,src,7 |
|
+ lvx 9,src,8 |
|
addi src,src,64 |
|
- stxvd2x 6,0,dst |
|
- stxvd2x 7,dst,6 |
|
- stxvd2x 8,dst,7 |
|
- stxvd2x 9,dst,8 |
|
+ stvx 6,0,dst |
|
+ stvx 7,dst,6 |
|
+ stvx 8,dst,7 |
|
+ stvx 9,dst,8 |
|
addi dst,dst,64 |
|
bdnz L(aligned_128head) |
|
|
|
L(aligned_tail): |
|
mtocrf 0x01,cnt |
|
bf 25,32f |
|
- lxvd2x 6,0,src |
|
- lxvd2x 7,src,6 |
|
- lxvd2x 8,src,7 |
|
- lxvd2x 9,src,8 |
|
+ lvx 6,0,src |
|
+ lvx 7,src,6 |
|
+ lvx 8,src,7 |
|
+ lvx 9,src,8 |
|
addi src,src,64 |
|
- stxvd2x 6,0,dst |
|
- stxvd2x 7,dst,6 |
|
- stxvd2x 8,dst,7 |
|
- stxvd2x 9,dst,8 |
|
+ stvx 6,0,dst |
|
+ stvx 7,dst,6 |
|
+ stvx 8,dst,7 |
|
+ stvx 9,dst,8 |
|
addi dst,dst,64 |
|
32: |
|
bf 26,16f |
|
- lxvd2x 6,0,src |
|
- lxvd2x 7,src,6 |
|
+ lvx 6,0,src |
|
+ lvx 7,src,6 |
|
addi src,src,32 |
|
- stxvd2x 6,0,dst |
|
- stxvd2x 7,dst,6 |
|
+ stvx 6,0,dst |
|
+ stvx 7,dst,6 |
|
addi dst,dst,32 |
|
16: |
|
bf 27,8f |
|
- lxvd2x 6,0,src |
|
+ lvx 6,0,src |
|
addi src,src,16 |
|
- stxvd2x 6,0,dst |
|
+ stvx 6,0,dst |
|
addi dst,dst,16 |
|
8: |
|
bf 28,4f
|
|
|