You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
412 lines
14 KiB
412 lines
14 KiB
7 years ago
|
# commit db9b4570c5dc550074140ac1d1677077fba29a26
|
||
|
# Author: Alan Modra <amodra@gmail.com>
|
||
|
# Date: Sat Aug 17 18:40:11 2013 +0930
|
||
|
#
|
||
|
# PowerPC LE strlen
|
||
|
# http://sourceware.org/ml/libc-alpha/2013-08/msg00097.html
|
||
|
#
|
||
|
# This is the first of nine patches adding little-endian support to the
|
||
|
# existing optimised string and memory functions. I did spend some
|
||
|
# time with a power7 simulator looking at cycle by cycle behaviour for
|
||
|
# memchr, but most of these patches have not been run on cpu simulators
|
||
|
# to check that we are going as fast as possible. I'm sure PowerPC can
|
||
|
# do better. However, the little-endian support mostly leaves main
|
||
|
# loops unchanged, so I'm banking on previous authors having done a
|
||
|
# good job on big-endian. As with most code you stare at long enough,
|
||
|
# I found some improvements for big-endian too.
|
||
|
#
|
||
|
# Little-endian support for strlen. Like most of the string functions,
|
||
|
# I leave the main word or multiple-word loops substantially unchanged,
|
||
|
# just needing to modify the tail.
|
||
|
#
|
||
|
# Removing the branch in the power7 functions is just a tidy. .align
|
||
|
# produces a branch anyway. Modifying regs in the non-power7 functions
|
||
|
# is to suit the new little-endian tail.
|
||
|
#
|
||
|
# * sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
|
||
|
# support. Don't branch over align.
|
||
|
# * sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
|
||
|
# * sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian support.
|
||
|
# Rearrange tmp reg use to suit. Comment.
|
||
|
# * sysdeps/powerpc/powerpc32/strlen.S: Likewise.
|
||
|
#
|
||
|
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S
|
||
|
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S 2014-05-28 12:28:44.000000000 -0500
|
||
|
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/power7/strlen.S 2014-05-28 12:28:45.000000000 -0500
|
||
|
@@ -31,7 +31,11 @@
|
||
|
li r0,0 /* Word with null chars to use with cmpb. */
|
||
|
li r5,-1 /* MASK = 0xffffffffffffffff. */
|
||
|
lwz r12,0(r4) /* Load word from memory. */
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ slw r5,r5,r6
|
||
|
+#else
|
||
|
srw r5,r5,r6 /* MASK = MASK >> padding. */
|
||
|
+#endif
|
||
|
orc r9,r12,r5 /* Mask bits that are not part of the string. */
|
||
|
cmpb r10,r9,r0 /* Check for null bytes in WORD1. */
|
||
|
cmpwi cr7,r10,0 /* If r10 == 0, no null's have been found. */
|
||
|
@@ -49,9 +53,6 @@
|
||
|
cmpb r10,r12,r0
|
||
|
cmpwi cr7,r10,0
|
||
|
bne cr7,L(done)
|
||
|
- b L(loop) /* We branch here (rather than falling through)
|
||
|
- to skip the nops due to heavy alignment
|
||
|
- of the loop below. */
|
||
|
|
||
|
/* Main loop to look for the end of the string. Since it's a
|
||
|
small loop (< 8 instructions), align it to 32-bytes. */
|
||
|
@@ -88,9 +89,15 @@
|
||
|
0xff in the same position as the null byte in the original
|
||
|
word from the string. Use that to calculate the length. */
|
||
|
L(done):
|
||
|
- cntlzw r0,r10 /* Count leading zeroes before the match. */
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
|
||
|
+ andc r9, r9, r10
|
||
|
+ popcntw r0, r9 /* Count the bits in the mask. */
|
||
|
+#else
|
||
|
+ cntlzw r0,r10 /* Count leading zeros before the match. */
|
||
|
+#endif
|
||
|
subf r5,r3,r4
|
||
|
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
|
||
|
+ srwi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
|
||
|
add r3,r5,r0 /* Compute final length. */
|
||
|
blr
|
||
|
END (BP_SYM (strlen))
|
||
|
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S
|
||
|
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S 2014-05-28 12:28:44.000000000 -0500
|
||
|
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc32/strlen.S 2014-05-28 12:32:24.000000000 -0500
|
||
|
@@ -31,7 +31,12 @@
|
||
|
1 is subtracted you get a value in the range 0x00-0x7f, none of which
|
||
|
have their high bit set. The expression here is
|
||
|
(x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
|
||
|
- there were no 0x00 bytes in the word.
|
||
|
+ there were no 0x00 bytes in the word. You get 0x80 in bytes that
|
||
|
+ match, but possibly false 0x80 matches in the next more significant
|
||
|
+ byte to a true match due to carries. For little-endian this is
|
||
|
+ of no consequence since the least significant match is the one
|
||
|
+ we're interested in, but big-endian needs method 2 to find which
|
||
|
+ byte matches.
|
||
|
|
||
|
2) Given a word 'x', we can test to see _which_ byte was zero by
|
||
|
calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
|
||
|
@@ -74,7 +79,7 @@
|
||
|
|
||
|
ENTRY (BP_SYM (strlen))
|
||
|
|
||
|
-#define rTMP1 r0
|
||
|
+#define rTMP4 r0
|
||
|
#define rRTN r3 /* incoming STR arg, outgoing result */
|
||
|
#define rSTR r4 /* current string position */
|
||
|
#define rPADN r5 /* number of padding bits we prepend to the
|
||
|
@@ -84,9 +89,9 @@
|
||
|
#define rWORD1 r8 /* current string word */
|
||
|
#define rWORD2 r9 /* next string word */
|
||
|
#define rMASK r9 /* mask for first string word */
|
||
|
-#define rTMP2 r10
|
||
|
-#define rTMP3 r11
|
||
|
-#define rTMP4 r12
|
||
|
+#define rTMP1 r10
|
||
|
+#define rTMP2 r11
|
||
|
+#define rTMP3 r12
|
||
|
|
||
|
CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2)
|
||
|
|
||
|
@@ -96,15 +101,20 @@
|
||
|
lwz rWORD1, 0(rSTR)
|
||
|
li rMASK, -1
|
||
|
addi r7F7F, r7F7F, 0x7f7f
|
||
|
-/* That's the setup done, now do the first pair of words.
|
||
|
- We make an exception and use method (2) on the first two words, to reduce
|
||
|
- overhead. */
|
||
|
+/* We use method (2) on the first two words, because rFEFE isn't
|
||
|
+ required which reduces setup overhead. Also gives a faster return
|
||
|
+ for small strings on big-endian due to needing to recalculate with
|
||
|
+ method (2) anyway. */
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ slw rMASK, rMASK, rPADN
|
||
|
+#else
|
||
|
srw rMASK, rMASK, rPADN
|
||
|
+#endif
|
||
|
and rTMP1, r7F7F, rWORD1
|
||
|
or rTMP2, r7F7F, rWORD1
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
- nor rTMP1, rTMP2, rTMP1
|
||
|
- and. rWORD1, rTMP1, rMASK
|
||
|
+ nor rTMP3, rTMP2, rTMP1
|
||
|
+ and. rTMP3, rTMP3, rMASK
|
||
|
mtcrf 0x01, rRTN
|
||
|
bne L(done0)
|
||
|
lis rFEFE, -0x101
|
||
|
@@ -113,11 +123,12 @@
|
||
|
bt 29, L(loop)
|
||
|
|
||
|
/* Handle second word of pair. */
|
||
|
+/* Perhaps use method (1) here for little-endian, saving one instruction? */
|
||
|
lwzu rWORD1, 4(rSTR)
|
||
|
and rTMP1, r7F7F, rWORD1
|
||
|
or rTMP2, r7F7F, rWORD1
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
- nor. rWORD1, rTMP2, rTMP1
|
||
|
+ nor. rTMP3, rTMP2, rTMP1
|
||
|
bne L(done0)
|
||
|
|
||
|
/* The loop. */
|
||
|
@@ -131,29 +142,53 @@
|
||
|
add rTMP3, rFEFE, rWORD2
|
||
|
nor rTMP4, r7F7F, rWORD2
|
||
|
bne L(done1)
|
||
|
- and. rTMP1, rTMP3, rTMP4
|
||
|
+ and. rTMP3, rTMP3, rTMP4
|
||
|
beq L(loop)
|
||
|
|
||
|
+#ifndef __LITTLE_ENDIAN__
|
||
|
and rTMP1, r7F7F, rWORD2
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
- andc rWORD1, rTMP4, rTMP1
|
||
|
+ andc rTMP3, rTMP4, rTMP1
|
||
|
b L(done0)
|
||
|
|
||
|
L(done1):
|
||
|
and rTMP1, r7F7F, rWORD1
|
||
|
subi rSTR, rSTR, 4
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
- andc rWORD1, rTMP2, rTMP1
|
||
|
+ andc rTMP3, rTMP2, rTMP1
|
||
|
|
||
|
/* When we get to here, rSTR points to the first word in the string that
|
||
|
- contains a zero byte, and the most significant set bit in rWORD1 is in that
|
||
|
- byte. */
|
||
|
+ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
|
||
|
+ and 0x00 otherwise. */
|
||
|
L(done0):
|
||
|
- cntlzw rTMP3, rWORD1
|
||
|
+ cntlzw rTMP3, rTMP3
|
||
|
subf rTMP1, rRTN, rSTR
|
||
|
srwi rTMP3, rTMP3, 3
|
||
|
add rRTN, rTMP1, rTMP3
|
||
|
/* GKM FIXME: check high bound. */
|
||
|
blr
|
||
|
+#else
|
||
|
+
|
||
|
+L(done0):
|
||
|
+ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
|
||
|
+ andc rTMP1, rTMP1, rTMP3
|
||
|
+ cntlzw rTMP1, rTMP1 /* Count bits not in the mask. */
|
||
|
+ subf rTMP3, rRTN, rSTR
|
||
|
+ subfic rTMP1, rTMP1, 32-7
|
||
|
+ srwi rTMP1, rTMP1, 3
|
||
|
+ add rRTN, rTMP1, rTMP3
|
||
|
+ blr
|
||
|
+
|
||
|
+L(done1):
|
||
|
+ addi rTMP3, rTMP1, -1
|
||
|
+ andc rTMP3, rTMP3, rTMP1
|
||
|
+ cntlzw rTMP3, rTMP3
|
||
|
+ subf rTMP1, rRTN, rSTR
|
||
|
+ subfic rTMP3, rTMP3, 32-7-32
|
||
|
+ srawi rTMP3, rTMP3, 3
|
||
|
+ add rRTN, rTMP1, rTMP3
|
||
|
+ blr
|
||
|
+#endif
|
||
|
+
|
||
|
END (BP_SYM (strlen))
|
||
|
libc_hidden_builtin_def (strlen)
|
||
|
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S
|
||
|
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S 2014-05-28 12:28:44.000000000 -0500
|
||
|
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/power7/strlen.S 2014-05-28 12:28:45.000000000 -0500
|
||
|
@@ -32,7 +32,11 @@
|
||
|
with cmpb. */
|
||
|
li r5,-1 /* MASK = 0xffffffffffffffff. */
|
||
|
ld r12,0(r4) /* Load doubleword from memory. */
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ sld r5,r5,r6
|
||
|
+#else
|
||
|
srd r5,r5,r6 /* MASK = MASK >> padding. */
|
||
|
+#endif
|
||
|
orc r9,r12,r5 /* Mask bits that are not part of the string. */
|
||
|
cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
|
||
|
cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
|
||
|
@@ -50,9 +54,6 @@
|
||
|
cmpb r10,r12,r0
|
||
|
cmpdi cr7,r10,0
|
||
|
bne cr7,L(done)
|
||
|
- b L(loop) /* We branch here (rather than falling through)
|
||
|
- to skip the nops due to heavy alignment
|
||
|
- of the loop below. */
|
||
|
|
||
|
/* Main loop to look for the end of the string. Since it's a
|
||
|
small loop (< 8 instructions), align it to 32-bytes. */
|
||
|
@@ -89,9 +90,15 @@
|
||
|
0xff in the same position as the null byte in the original
|
||
|
doubleword from the string. Use that to calculate the length. */
|
||
|
L(done):
|
||
|
- cntlzd r0,r10 /* Count leading zeroes before the match. */
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
|
||
|
+ andc r9, r9, r10
|
||
|
+ popcntd r0, r9 /* Count the bits in the mask. */
|
||
|
+#else
|
||
|
+ cntlzd r0,r10 /* Count leading zeros before the match. */
|
||
|
+#endif
|
||
|
subf r5,r3,r4
|
||
|
- srdi r0,r0,3 /* Convert leading zeroes to bytes. */
|
||
|
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
|
||
|
add r3,r5,r0 /* Compute final length. */
|
||
|
blr
|
||
|
END (BP_SYM (strlen))
|
||
|
diff -urN glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S
|
||
|
--- glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S 2014-05-28 12:28:44.000000000 -0500
|
||
|
+++ glibc-2.17-c758a686/sysdeps/powerpc/powerpc64/strlen.S 2014-05-28 12:38:17.000000000 -0500
|
||
|
@@ -31,7 +31,12 @@
|
||
|
1 is subtracted you get a value in the range 0x00-0x7f, none of which
|
||
|
have their high bit set. The expression here is
|
||
|
(x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
|
||
|
- there were no 0x00 bytes in the word.
|
||
|
+ there were no 0x00 bytes in the word. You get 0x80 in bytes that
|
||
|
+ match, but possibly false 0x80 matches in the next more significant
|
||
|
+ byte to a true match due to carries. For little-endian this is
|
||
|
+ of no consequence since the least significant match is the one
|
||
|
+ we're interested in, but big-endian needs method 2 to find which
|
||
|
+ byte matches.
|
||
|
|
||
|
2) Given a word 'x', we can test to see _which_ byte was zero by
|
||
|
calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
|
||
|
@@ -64,7 +69,7 @@
|
||
|
Answer:
|
||
|
1) Added a Data Cache Block Touch early to prefetch the first 128
|
||
|
byte cache line. Adding dcbt instructions to the loop would not be
|
||
|
- effective since most strings will be shorter than the cache line.*/
|
||
|
+ effective since most strings will be shorter than the cache line. */
|
||
|
|
||
|
/* Some notes on register usage: Under the SVR4 ABI, we can use registers
|
||
|
0 and 3 through 12 (so long as we don't call any procedures) without
|
||
|
@@ -80,7 +85,7 @@
|
||
|
ENTRY (BP_SYM (strlen))
|
||
|
CALL_MCOUNT 1
|
||
|
|
||
|
-#define rTMP1 r0
|
||
|
+#define rTMP4 r0
|
||
|
#define rRTN r3 /* incoming STR arg, outgoing result */
|
||
|
#define rSTR r4 /* current string position */
|
||
|
#define rPADN r5 /* number of padding bits we prepend to the
|
||
|
@@ -90,9 +95,9 @@
|
||
|
#define rWORD1 r8 /* current string doubleword */
|
||
|
#define rWORD2 r9 /* next string doubleword */
|
||
|
#define rMASK r9 /* mask for first string doubleword */
|
||
|
-#define rTMP2 r10
|
||
|
-#define rTMP3 r11
|
||
|
-#define rTMP4 r12
|
||
|
+#define rTMP1 r10
|
||
|
+#define rTMP2 r11
|
||
|
+#define rTMP3 r12
|
||
|
|
||
|
/* Note: The Bounded pointer support in this code is broken. This code
|
||
|
was inherited from PPC32 and that support was never completed.
|
||
|
@@ -109,30 +114,36 @@
|
||
|
addi r7F7F, r7F7F, 0x7f7f
|
||
|
li rMASK, -1
|
||
|
insrdi r7F7F, r7F7F, 32, 0
|
||
|
-/* That's the setup done, now do the first pair of doublewords.
|
||
|
- We make an exception and use method (2) on the first two doublewords,
|
||
|
- to reduce overhead. */
|
||
|
- srd rMASK, rMASK, rPADN
|
||
|
+/* We use method (2) on the first two doublewords, because rFEFE isn't
|
||
|
+ required which reduces setup overhead. Also gives a faster return
|
||
|
+ for small strings on big-endian due to needing to recalculate with
|
||
|
+ method (2) anyway. */
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ sld rMASK, rMASK, rPADN
|
||
|
+#else
|
||
|
+ srd rMASK, rMASK, rPADN
|
||
|
+#endif
|
||
|
and rTMP1, r7F7F, rWORD1
|
||
|
or rTMP2, r7F7F, rWORD1
|
||
|
lis rFEFE, -0x101
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
addi rFEFE, rFEFE, -0x101
|
||
|
- nor rTMP1, rTMP2, rTMP1
|
||
|
- and. rWORD1, rTMP1, rMASK
|
||
|
+ nor rTMP3, rTMP2, rTMP1
|
||
|
+ and. rTMP3, rTMP3, rMASK
|
||
|
mtcrf 0x01, rRTN
|
||
|
bne L(done0)
|
||
|
- sldi rTMP1, rFEFE, 32
|
||
|
- add rFEFE, rFEFE, rTMP1
|
||
|
+ sldi rTMP1, rFEFE, 32
|
||
|
+ add rFEFE, rFEFE, rTMP1
|
||
|
/* Are we now aligned to a doubleword boundary? */
|
||
|
bt 28, L(loop)
|
||
|
|
||
|
/* Handle second doubleword of pair. */
|
||
|
+/* Perhaps use method (1) here for little-endian, saving one instruction? */
|
||
|
ldu rWORD1, 8(rSTR)
|
||
|
and rTMP1, r7F7F, rWORD1
|
||
|
or rTMP2, r7F7F, rWORD1
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
- nor. rWORD1, rTMP2, rTMP1
|
||
|
+ nor. rTMP3, rTMP2, rTMP1
|
||
|
bne L(done0)
|
||
|
|
||
|
/* The loop. */
|
||
|
@@ -146,29 +157,53 @@
|
||
|
add rTMP3, rFEFE, rWORD2
|
||
|
nor rTMP4, r7F7F, rWORD2
|
||
|
bne L(done1)
|
||
|
- and. rTMP1, rTMP3, rTMP4
|
||
|
+ and. rTMP3, rTMP3, rTMP4
|
||
|
beq L(loop)
|
||
|
|
||
|
+#ifndef __LITTLE_ENDIAN__
|
||
|
and rTMP1, r7F7F, rWORD2
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
- andc rWORD1, rTMP4, rTMP1
|
||
|
+ andc rTMP3, rTMP4, rTMP1
|
||
|
b L(done0)
|
||
|
|
||
|
L(done1):
|
||
|
and rTMP1, r7F7F, rWORD1
|
||
|
subi rSTR, rSTR, 8
|
||
|
add rTMP1, rTMP1, r7F7F
|
||
|
- andc rWORD1, rTMP2, rTMP1
|
||
|
+ andc rTMP3, rTMP2, rTMP1
|
||
|
|
||
|
/* When we get to here, rSTR points to the first doubleword in the string that
|
||
|
- contains a zero byte, and the most significant set bit in rWORD1 is in that
|
||
|
- byte. */
|
||
|
+ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
|
||
|
+ otherwise. */
|
||
|
L(done0):
|
||
|
- cntlzd rTMP3, rWORD1
|
||
|
+ cntlzd rTMP3, rTMP3
|
||
|
subf rTMP1, rRTN, rSTR
|
||
|
srdi rTMP3, rTMP3, 3
|
||
|
add rRTN, rTMP1, rTMP3
|
||
|
/* GKM FIXME: check high bound. */
|
||
|
blr
|
||
|
+#else
|
||
|
+
|
||
|
+L(done0):
|
||
|
+ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
|
||
|
+ andc rTMP1, rTMP1, rTMP3
|
||
|
+ cntlzd rTMP1, rTMP1 /* Count bits not in the mask. */
|
||
|
+ subf rTMP3, rRTN, rSTR
|
||
|
+ subfic rTMP1, rTMP1, 64-7
|
||
|
+ srdi rTMP1, rTMP1, 3
|
||
|
+ add rRTN, rTMP1, rTMP3
|
||
|
+ blr
|
||
|
+
|
||
|
+L(done1):
|
||
|
+ addi rTMP3, rTMP1, -1
|
||
|
+ andc rTMP3, rTMP3, rTMP1
|
||
|
+ cntlzd rTMP3, rTMP3
|
||
|
+ subf rTMP1, rRTN, rSTR
|
||
|
+ subfic rTMP3, rTMP3, 64-7-64
|
||
|
+ sradi rTMP3, rTMP3, 3
|
||
|
+ add rRTN, rTMP1, rTMP3
|
||
|
+ blr
|
||
|
+#endif
|
||
|
+
|
||
|
END (BP_SYM (strlen))
|
||
|
libc_hidden_builtin_def (strlen)
|