The memmove-related fix is dropped from this patch because rhel-7.5
does not have an optimized memmove for POWER7.
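
For context on the substitution below: lvx/stvx (VMX) and lxvd2x/stxvd2x
(VSX) each move 16 bytes, but lvx/stvx ignore the low four bits of the
effective address, so they are only usable on 16-byte-aligned accesses,
which is why the change is limited to the aligned copy path.  The two
forms load vector elements in different orders on little-endian, but that
cancels out when the same form does both the load and the store, as in a
plain copy.  A minimal sketch of the substitution (illustrative only, not
part of the patch):

	/* VMX copy of 16 bytes; src and dst must be 16-byte aligned
	   because lvx/stvx ignore the low 4 bits of the address.  */
	lvx	6,0,src		/* vr6 = 16 bytes at (src).  */
	stvx	6,0,dst		/* Store vr6 to (dst).  */

	/* VSX form being replaced: it also handles unaligned
	   addresses, but some cache-inhibited loads trap to the
	   kernel on POWER9 DD2.1 and earlier.  */
	lxvd2x	6,0,src
	stxvd2x	6,0,dst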

commit 63da5cd4a097d089033d980c42254c3356fa723f
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: Wed Oct 25 13:13:53 2017 -0200

powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove

POWER9 DD2.1 and earlier has an issue where some cache inhibited
vector load traps to the kernel, causing a performance degradation. To
handle this in memcpy and memmove, lvx/stvx is used for aligned
addresses instead of lxvd2x/stxvd2x.

Reference: https://patchwork.ozlabs.org/patch/814059/

* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
lxvd2x/stxvd2x with lvx/stvx.
* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
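
A note for reading the hunk below: the same small numbers name both vector
registers and GPR index registers.  Earlier in L(aligned_copy), outside
this hunk, GPRs 6, 7 and 8 are loaded with the constants 16, 32 and 48, so
"lvx 7,src,6" loads the 16 bytes at src+16 into vr7, and each loop
iteration moves 128 bytes (srdi 12,cnt,7 puts cnt/128 into the CTR via
mtctr).  An illustrative expansion of one load group (not part of the
patch; it assumes the r6/r7/r8 setup just described):

	lvx	6,0,src		/* vr6 = src[ 0..15]; RA=0 means zero, not r0.  */
	lvx	7,src,6		/* vr7 = src[16..31]  (r6 = 16).  */
	lvx	8,src,7		/* vr8 = src[32..47]  (r7 = 32).  */
	lvx	9,src,8		/* vr9 = src[48..63]  (r8 = 48).  */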

diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 1ccbc2e..a7cdf8b 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -91,63 +91,63 @@ L(aligned_copy):
 	srdi 12,cnt,7
 	cmpdi 12,0
 	beq L(aligned_tail)
-	lxvd2x 6,0,src
-	lxvd2x 7,src,6
+	lvx 6,0,src
+	lvx 7,src,6
 	mtctr 12
 	b L(aligned_128loop)
 
 	.align 4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x 6,0,src
-	lxvd2x 7,src,6
+	lvx 6,0,src
+	lvx 7,src,6
 L(aligned_128loop):
-	lxvd2x 8,src,7
-	lxvd2x 9,src,8
-	stxvd2x 6,0,dst
+	lvx 8,src,7
+	lvx 9,src,8
+	stvx 6,0,dst
 	addi src,src,64
-	stxvd2x 7,dst,6
-	stxvd2x 8,dst,7
-	stxvd2x 9,dst,8
-	lxvd2x 6,0,src
-	lxvd2x 7,src,6
+	stvx 7,dst,6
+	stvx 8,dst,7
+	stvx 9,dst,8
+	lvx 6,0,src
+	lvx 7,src,6
 	addi dst,dst,64
-	lxvd2x 8,src,7
-	lxvd2x 9,src,8
+	lvx 8,src,7
+	lvx 9,src,8
 	addi src,src,64
-	stxvd2x 6,0,dst
-	stxvd2x 7,dst,6
-	stxvd2x 8,dst,7
-	stxvd2x 9,dst,8
+	stvx 6,0,dst
+	stvx 7,dst,6
+	stvx 8,dst,7
+	stvx 9,dst,8
 	addi dst,dst,64
 	bdnz L(aligned_128head)
 
 L(aligned_tail):
 	mtocrf 0x01,cnt
 	bf 25,32f
-	lxvd2x 6,0,src
-	lxvd2x 7,src,6
-	lxvd2x 8,src,7
-	lxvd2x 9,src,8
+	lvx 6,0,src
+	lvx 7,src,6
+	lvx 8,src,7
+	lvx 9,src,8
 	addi src,src,64
-	stxvd2x 6,0,dst
-	stxvd2x 7,dst,6
-	stxvd2x 8,dst,7
-	stxvd2x 9,dst,8
+	stvx 6,0,dst
+	stvx 7,dst,6
+	stvx 8,dst,7
+	stvx 9,dst,8
 	addi dst,dst,64
 32:
 	bf 26,16f
-	lxvd2x 6,0,src
-	lxvd2x 7,src,6
+	lvx 6,0,src
+	lvx 7,src,6
 	addi src,src,32
-	stxvd2x 6,0,dst
-	stxvd2x 7,dst,6
+	stvx 6,0,dst
+	stvx 7,dst,6
 	addi dst,dst,32
 16:
 	bf 27,8f
-	lxvd2x 6,0,src
+	lvx 6,0,src
 	addi src,src,16
-	stxvd2x 6,0,dst
+	stvx 6,0,dst
 	addi dst,dst,16
 8:
 	bf 28,4f