You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
190 lines
5.0 KiB
190 lines
5.0 KiB
From 2ffa8b8660a7a17572ae5a398171c8be59985eb3 Mon Sep 17 00:00:00 2001 |
|
From: "Gabriel F. T. Gomes" <gftg@linux.vnet.ibm.com> |
|
Date: Mon, 25 Jan 2016 10:50:34 -0500 |
|
Subject: [PATCH] powerpc: Zero pad using memset in strncpy/stpncpy |
|
|
|
Call __memset_power8 to pad, with zeros, the remaining bytes in the |
|
dest string in __strncpy_power8 and __stpncpy_power8. This improves |
|
performance when n is larger than the input string, giving ~30% gain for |
|
larger strings without noticeably affecting shorter strings. |
|
|
|
(cherry picked from commit 72c11b353ede72931cc474c9071d143d9a05c0d7) |
|
--- |
|
ChangeLog | 5 ++ |
|
sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++---------------- |
|
2 files changed, 61 insertions(+), 67 deletions(-) |
|
|
|
diff --git a/ChangeLog b/ChangeLog |
|
index 5537fc6..8d0e296 100644 |
|
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S |
|
index 5fda953..80136cc 100644 |
|
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S |
|
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S |
|
@@ -24,6 +24,8 @@ |
|
# define FUNC_NAME strncpy |
|
#endif |
|
|
|
+#define FRAMESIZE (FRAME_MIN_SIZE+48) |
|
+ |
|
/* Implements the function |
|
|
|
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) |
|
@@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) |
|
addi r10,r4,16 |
|
rlwinm r9,r4,0,19,19 |
|
|
|
- /* Since it is a leaf function, save some non-volatile registers on the |
|
- protected/red zone. */ |
|
+ /* Save some non-volatile registers on the stack. */ |
|
std r26,-48(r1) |
|
std r27,-40(r1) |
|
|
|
@@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) |
|
std r30,-16(r1) |
|
std r31,-8(r1) |
|
|
|
+ /* Update CFI. */ |
|
+ cfi_offset(r26, -48) |
|
+ cfi_offset(r27, -40) |
|
+ cfi_offset(r28, -32) |
|
+ cfi_offset(r29, -24) |
|
+ cfi_offset(r30, -16) |
|
+ cfi_offset(r31, -8) |
|
+ |
|
beq cr7,L(unaligned_lt_16) |
|
rldicl r9,r4,0,61 |
|
subfic r8,r9,8 |
|
@@ -144,74 +153,58 @@ L(short_path_loop_end): |
|
ld r31,-8(r1) |
|
blr |
|
|
|
- /* This code pads the remainder dest with NULL bytes. The algorithm |
|
- calculate the remanining size and issues a doubleword unrolled |
|
- loops followed by a byte a byte set. */ |
|
+ /* This code pads the remainder of dest with NULL bytes. The algorithm |
|
+ calculates the remaining size and calls memset. */ |
|
.align 4 |
|
L(zero_pad_start): |
|
mr r5,r10 |
|
mr r9,r6 |
|
L(zero_pad_start_1): |
|
- srdi. r8,r5,r3 |
|
- mr r10,r9 |
|
-#ifdef USE_AS_STPNCPY |
|
- mr r3,r9 |
|
+ /* At this point: |
|
+ - r5 holds the number of bytes that still have to be written to |
|
+ dest. |
|
+ - r9 points to the position, in dest, where the first null byte |
|
+ will be written. |
|
+ The above statements are true both when control reaches this label |
|
+ from a branch or when falling through the previous lines. */ |
|
+#ifndef USE_AS_STPNCPY |
|
+ mr r30,r3 /* Save the return value of strncpy. */ |
|
+#endif |
|
+ /* Prepare the call to memset. */ |
|
+ mr r3,r9 /* Pointer to the area to be zero-filled. */ |
|
+ li r4,0 /* Byte to be written (zero). */ |
|
+ |
|
+ /* We delayed the creation of the stack frame, as well as the saving of |
|
+ the link register, because only at this point, we are sure that |
|
+ doing so is actually needed. */ |
|
+ |
|
+ /* Save the link register. */ |
|
+ mflr r0 |
|
+ std r0,16(r1) |
|
+ cfi_offset(lr, 16) |
|
+ |
|
+ /* Create the stack frame. */ |
|
+ stdu r1,-FRAMESIZE(r1) |
|
+ cfi_adjust_cfa_offset(FRAMESIZE) |
|
+ |
|
+ bl __memset_power8 |
|
+ nop |
|
+ |
|
+ /* Restore the stack frame. */ |
|
+ addi r1,r1,FRAMESIZE |
|
+ cfi_adjust_cfa_offset(-FRAMESIZE) |
|
+ /* Restore the link register. */ |
|
+ ld r0,16(r1) |
|
+ mtlr r0 |
|
+ |
|
+#ifndef USE_AS_STPNCPY |
|
+ mr r3,r30 /* Restore the return value of strncpy, i.e.: |
|
+ dest. For stpncpy, the return value is the |
|
+ same as return value of memset. */ |
|
#endif |
|
- beq- cr0,L(zero_pad_loop_b_start) |
|
- cmpldi cr7,r8,1 |
|
- li cr7,0 |
|
- std r7,0(r9) |
|
- beq cr7,L(zero_pad_loop_b_prepare) |
|
- addic. r8,r8,-2 |
|
- addi r10,r9,r16 |
|
- std r7,8(r9) |
|
- beq cr0,L(zero_pad_loop_dw_2) |
|
- std r7,16(r9) |
|
- li r9,0 |
|
- b L(zero_pad_loop_dw_1) |
|
- |
|
- .align 4 |
|
-L(zero_pad_loop_dw): |
|
- addi r10,r10,16 |
|
- std r9,-8(r10) |
|
- beq cr0,L(zero_pad_loop_dw_2) |
|
- std r9,0(r10) |
|
-L(zero_pad_loop_dw_1): |
|
- cmpldi cr7,r8,1 |
|
- std r9,0(r10) |
|
- addic. r8,r8,-2 |
|
- bne cr7,L(zero_pad_loop_dw) |
|
- addi r10,r10,8 |
|
-L(zero_pad_loop_dw_2): |
|
- rldicl r5,r5,0,61 |
|
-L(zero_pad_loop_b_start): |
|
- cmpdi cr7,r5,0 |
|
- addi r5,r5,-1 |
|
- addi r9,r10,-1 |
|
- add r10,r10,5 |
|
- subf r10,r9,r10 |
|
- li r8,0 |
|
- beq- cr7,L(short_path_loop_end) |
|
- |
|
- /* Write remaining 1-8 bytes. */ |
|
- .align 4 |
|
- addi r9,r9,1 |
|
- mtocrf 0x1,r10 |
|
- bf 29,4f |
|
- stw r8,0(r9) |
|
- addi r9,r9,4 |
|
- |
|
- .align 4 |
|
-4: bf 30,2f |
|
- sth r8,0(r9) |
|
- addi r9,r9,2 |
|
- |
|
- .align 4 |
|
-2: bf 31,1f |
|
- stb r8,0(r9) |
|
|
|
- /* Restore non-volatile registers. */ |
|
-1: ld r26,-48(r1) |
|
+ /* Restore non-volatile registers and return. */ |
|
+ ld r26,-48(r1) |
|
ld r27,-40(r1) |
|
ld r28,-32(r1) |
|
ld r29,-24(r1) |
|
@@ -407,10 +400,6 @@ L(short_path_prepare_2_3): |
|
mr r4,r28 |
|
mr r9,r29 |
|
b L(short_path_2) |
|
-L(zero_pad_loop_b_prepare): |
|
- addi r10,r9,8 |
|
- rldicl r5,r5,0,61 |
|
- b L(zero_pad_loop_b_start) |
|
L(zero_pad_start_prepare_1): |
|
mr r5,r6 |
|
mr r9,r8 |
|
-- |
|
2.1.0 |
|
|
|
|