You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1273 lines
38 KiB
1273 lines
38 KiB
7 years ago
|
Backport of:
|
||
|
|
||
|
commit f06a4faf8a2b4d046eb40e94b47948cc47d79902
|
||
|
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
|
||
|
Date: Wed Dec 31 11:47:41 2014 -0500
|
||
|
|
||
|
powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
|
||
|
|
||
|
This patch adds an optimized POWER8 st{r,p}ncpy using unaligned accesses.
|
||
|
It shows 10%-80% improvement over the optimized POWER7 one that uses
|
||
|
only aligned accesses, especially on unaligned inputs.
|
||
|
|
||
|
The algorithm first reads and checks 16 bytes (if inputs do not cross a 4K
|
||
|
page size). Then it realigns the source to 16 bytes and issues a 16-byte read
|
||
|
and compare loop to speed up null byte checks for large strings. Also,
|
||
|
unlike the POWER7 optimization, the null pad is done inline in the
|
||
|
implementation using possibly unaligned accesses, instead of relying on
|
||
|
a memset call. Special case is added for page cross reads.
|
||
|
|
||
|
ChangeLog:
|
||
|
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
|
||
|
|
||
|
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
|
||
|
Add strncpy-power8 and stpncpy-power8 objects.
|
||
|
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||
|
(__libc_ifunc_impl_list): Add __strncpy_power8 and __stpncpy_power8
|
||
|
implementations.
|
||
|
* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
|
||
|
* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
|
||
|
__stpncpy_power8 implementation.
|
||
|
* sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
|
||
|
* sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
|
||
|
__strncpy_power8 implementation.
|
||
|
* sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
|
||
|
* sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
|
||
|
* NEWS: Update.
|
||
|
|
||
|
and its dependency:
|
||
|
|
||
|
commit f360f94a05570045be615649e9a411cefba2e210
|
||
|
Author: Vidya Ranganathan <vidya@linux.vnet.ibm.com>
|
||
|
Date: Mon May 5 19:10:45 2014 -0500
|
||
|
|
||
|
PowerPC: strncpy/stpncpy optimization for PPC64/POWER7
|
||
|
|
||
|
The optimization is achieved by following techniques:
|
||
|
> data alignment [gain from aligned memory access on read/write]
|
||
|
> POWER7 gains performance with loop unrolling/unwinding
|
||
|
[gain by reduction of branch penalty].
|
||
|
> zero padding done by calling optimized memset
|
||
|
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||
|
index 74ae710..ef39917 100644
|
||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||
|
@@ -8,6 +8,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
|
||
|
stpcpy-power8 stpcpy-power7 stpcpy-ppc64 \
|
||
|
strcat-power8 strcat-power7 strcat-ppc64 \
|
||
|
strcpy-power8 strcpy-power7 strcpy-ppc64 \
|
||
|
+ stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
|
||
|
+ strncpy-power8 strncpy-power7 strncpy-ppc64 \
|
||
|
strncat-power7 \
|
||
|
rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
|
||
|
strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||
|
index 4e5bb17..23bf5dc 100644
|
||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||
|
@@ -255,5 +255,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||
|
IFUNC_IMPL_ADD (array, i, wcscpy, 1,
|
||
|
__wcscpy_ppc))
|
||
|
|
||
|
+ /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
|
||
|
+ IFUNC_IMPL (i, name, strncpy,
|
||
|
+ IFUNC_IMPL_ADD (array, i, strncpy,
|
||
|
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
|
||
|
+ __strncpy_power8)
|
||
|
+ IFUNC_IMPL_ADD (array, i, strncpy,
|
||
|
+ hwcap & PPC_FEATURE_HAS_VSX,
|
||
|
+ __strncpy_power7)
|
||
|
+ IFUNC_IMPL_ADD (array, i, strncpy, 1,
|
||
|
+ __strncpy_ppc))
|
||
|
+
|
||
|
+ /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
|
||
|
+ IFUNC_IMPL (i, name, stpncpy,
|
||
|
+ IFUNC_IMPL_ADD (array, i, stpncpy,
|
||
|
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
|
||
|
+ __stpncpy_power8)
|
||
|
+ IFUNC_IMPL_ADD (array, i, stpncpy,
|
||
|
+ hwcap & PPC_FEATURE_HAS_VSX,
|
||
|
+ __stpncpy_power7)
|
||
|
+ IFUNC_IMPL_ADD (array, i, stpncpy, 1,
|
||
|
+ __stpncpy_ppc))
|
||
|
+
|
||
|
return i;
|
||
|
}
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
|
||
|
new file mode 100644
|
||
|
index 0000000..e29674f
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
|
||
|
@@ -0,0 +1,44 @@
|
||
|
+/* Optimized stpncpy implementation for POWER7.
|
||
|
+ Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <sysdep.h>
|
||
|
+
|
||
|
+#define USE_AS_STPNCPY
|
||
|
+
|
||
|
+#undef EALIGN
|
||
|
+#define EALIGN(name, alignt, words) \
|
||
|
+ .section ".text"; \
|
||
|
+ ENTRY_2(__stpncpy_power7) \
|
||
|
+ .align ALIGNARG(alignt); \
|
||
|
+ EALIGN_W_##words; \
|
||
|
+ BODY_LABEL(__stpncpy_power7): \
|
||
|
+ cfi_startproc; \
|
||
|
+ LOCALENTRY(__stpncpy_power7)
|
||
|
+
|
||
|
+#undef END
|
||
|
+#define END(name) \
|
||
|
+ cfi_endproc; \
|
||
|
+ TRACEBACK(__stpncpy_power7) \
|
||
|
+ END_2(__stpncpy_power7)
|
||
|
+
|
||
|
+#undef libc_hidden_builtin_def
|
||
|
+#define libc_hidden_builtin_def(name)
|
||
|
+
|
||
|
+#define MEMSET __memset_power7
|
||
|
+
|
||
|
+#include <sysdeps/powerpc/powerpc64/power7/stpncpy.S>
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
|
||
|
new file mode 100644
|
||
|
index 0000000..d5d835d
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
|
||
|
@@ -0,0 +1,39 @@
|
||
|
+/* Optimized stpncpy implementation for POWER8.
|
||
|
+ Copyright (C) 2015 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <sysdep.h>
|
||
|
+
|
||
|
+#define USE_AS_STPNCPY
|
||
|
+
|
||
|
+#undef EALIGN
|
||
|
+#define EALIGN(name, alignt, words) \
|
||
|
+ .section ".text"; \
|
||
|
+ ENTRY_2(__stpncpy_power8) \
|
||
|
+ .align ALIGNARG(alignt); \
|
||
|
+ EALIGN_W_##words; \
|
||
|
+ BODY_LABEL(__stpncpy_power8): \
|
||
|
+ cfi_startproc; \
|
||
|
+ LOCALENTRY(__stpncpy_power8)
|
||
|
+
|
||
|
+#undef END
|
||
|
+#define END(name) \
|
||
|
+ cfi_endproc; \
|
||
|
+ TRACEBACK(__stpncpy_power8) \
|
||
|
+ END_2(__stpncpy_power8)
|
||
|
+
|
||
|
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
|
||
|
new file mode 100644
|
||
|
index 0000000..74f47a7
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
|
||
|
@@ -0,0 +1,26 @@
|
||
|
+/* Default stpncpy implementation for PowerPC64.
|
||
|
+ Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#define STPNCPY __stpncpy_ppc
|
||
|
+#ifdef SHARED
|
||
|
+#undef libc_hidden_def
|
||
|
+#define libc_hidden_def(name) \
|
||
|
+ __hidden_ver1 (__stpncpy_ppc, __GI___stpncpy, __stpncpy_ppc);
|
||
|
+#endif
|
||
|
+
|
||
|
+#include <string/stpncpy.c>
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
|
||
|
new file mode 100644
|
||
|
index 0000000..3ee50e5
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
|
||
|
@@ -0,0 +1,36 @@
|
||
|
+/* Multiple versions of stpncpy. PowerPC64 version.
|
||
|
+ Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#ifndef NOT_IN_libc
|
||
|
+# include <string.h>
|
||
|
+# include <shlib-compat.h>
|
||
|
+# include "init-arch.h"
|
||
|
+
|
||
|
+extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
|
||
|
+extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
|
||
|
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
|
||
|
+
|
||
|
+libc_ifunc (__stpncpy,
|
||
|
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
|
||
|
+ ? __stpncpy_power8 :
|
||
|
+ (hwcap & PPC_FEATURE_HAS_VSX)
|
||
|
+ ? __stpncpy_power7
|
||
|
+ : __stpncpy_ppc);
|
||
|
+
|
||
|
+weak_alias (__stpncpy, stpncpy)
|
||
|
+#endif
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
|
||
|
new file mode 100644
|
||
|
index 0000000..be349f9
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
|
||
|
@@ -0,0 +1,42 @@
|
||
|
+/* Optimized strncpy implementation for POWER7.
|
||
|
+ Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <sysdep.h>
|
||
|
+
|
||
|
+#undef EALIGN
|
||
|
+#define EALIGN(name, alignt, words) \
|
||
|
+ .section ".text"; \
|
||
|
+ ENTRY_2(__strncpy_power7) \
|
||
|
+ .align ALIGNARG(alignt); \
|
||
|
+ EALIGN_W_##words; \
|
||
|
+ BODY_LABEL(__strncpy_power7): \
|
||
|
+ cfi_startproc; \
|
||
|
+ LOCALENTRY(__strncpy_power7)
|
||
|
+
|
||
|
+#undef END
|
||
|
+#define END(name) \
|
||
|
+ cfi_endproc; \
|
||
|
+ TRACEBACK(__strncpy_power7) \
|
||
|
+ END_2(__strncpy_power7)
|
||
|
+
|
||
|
+#undef libc_hidden_builtin_def
|
||
|
+#define libc_hidden_builtin_def(name)
|
||
|
+
|
||
|
+#define MEMSET __memset_power7
|
||
|
+
|
||
|
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
|
||
|
new file mode 100644
|
||
|
index 0000000..ed906a4
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
|
||
|
@@ -0,0 +1,40 @@
|
||
|
+/* Optimized strncpy implementation for POWER8.
|
||
|
+ Copyright (C) 2015 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <sysdep.h>
|
||
|
+
|
||
|
+#undef EALIGN
|
||
|
+#define EALIGN(name, alignt, words) \
|
||
|
+ .section ".text"; \
|
||
|
+ ENTRY_2(__strncpy_power8) \
|
||
|
+ .align ALIGNARG(alignt); \
|
||
|
+ EALIGN_W_##words; \
|
||
|
+ BODY_LABEL(__strncpy_power8): \
|
||
|
+ cfi_startproc; \
|
||
|
+ LOCALENTRY(__strncpy_power8)
|
||
|
+
|
||
|
+#undef END
|
||
|
+#define END(name) \
|
||
|
+ cfi_endproc; \
|
||
|
+ TRACEBACK(__strncpy_power8) \
|
||
|
+ END_2(__strncpy_power8)
|
||
|
+
|
||
|
+#undef libc_hidden_builtin_def
|
||
|
+#define libc_hidden_builtin_def(name)
|
||
|
+
|
||
|
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
|
||
|
new file mode 100644
|
||
|
index 0000000..e3111d2
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
|
||
|
@@ -0,0 +1,33 @@
|
||
|
+/* Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <string.h>
|
||
|
+
|
||
|
+#define STRNCPY __strncpy_ppc
|
||
|
+#undef weak_alias
|
||
|
+#define weak_alias(name, aliasname) \
|
||
|
+ extern __typeof (__strncpy_ppc) aliasname \
|
||
|
+ __attribute__ ((weak, alias ("__strncpy_ppc")));
|
||
|
+#if !defined(NOT_IN_libc) && defined(SHARED)
|
||
|
+# undef libc_hidden_builtin_def
|
||
|
+# define libc_hidden_builtin_def(name) \
|
||
|
+ __hidden_ver1(__strncpy_ppc, __GI_strncpy, __strncpy_ppc);
|
||
|
+#endif
|
||
|
+
|
||
|
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
|
||
|
+
|
||
|
+#include <string/strncpy.c>
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
|
||
|
new file mode 100644
|
||
|
index 0000000..19927bc
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
|
||
|
@@ -0,0 +1,38 @@
|
||
|
+/* Multiple versions of strncpy.
|
||
|
+ Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+/* Define multiple versions only for definition in libc. */
|
||
|
+#ifndef NOT_IN_libc
|
||
|
+# include <string.h>
|
||
|
+# include <shlib-compat.h>
|
||
|
+# include "init-arch.h"
|
||
|
+
|
||
|
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
|
||
|
+extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
|
||
|
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
|
||
|
+
|
||
|
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
|
||
|
+ ifunc symbol properly. */
|
||
|
+libc_ifunc (strncpy,
|
||
|
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
|
||
|
+ ? __strncpy_power8 :
|
||
|
+ (hwcap & PPC_FEATURE_HAS_VSX)
|
||
|
+ ? __strncpy_power7
|
||
|
+ : __strncpy_ppc);
|
||
|
+
|
||
|
+#endif
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
|
||
|
new file mode 100644
|
||
|
index 0000000..a539093
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
|
||
|
@@ -0,0 +1,24 @@
|
||
|
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
|
||
|
+ Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#define USE_AS_STPNCPY
|
||
|
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
|
||
|
+
|
||
|
+weak_alias (__stpncpy, stpncpy)
|
||
|
+libc_hidden_def (__stpncpy)
|
||
|
+libc_hidden_builtin_def (stpncpy)
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
|
||
|
new file mode 100644
|
||
|
index 0000000..51860df
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
|
||
|
@@ -0,0 +1,338 @@
|
||
|
+/* Copyright (C) 2014 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <sysdep.h>
|
||
|
+
|
||
|
+/* Implements the functions
|
||
|
+
|
||
|
+ char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
|
||
|
+
|
||
|
+ AND
|
||
|
+
|
||
|
+ char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
|
||
|
+
|
||
|
+ The algorithm is as follows:
|
||
|
+ > if src and dest are 8 byte aligned, perform double word copy
|
||
|
+ else
|
||
|
+ > copy byte by byte on unaligned addresses.
|
||
|
+
|
||
|
+ The aligned comparisons are made using cmpb instructions. */
|
||
|
+
|
||
|
+/* The focus on optimization for performance improvements are as follows:
|
||
|
+ 1. data alignment [gain from aligned memory access on read/write]
|
||
|
+ 2. POWER7 gains performance with loop unrolling/unwinding
|
||
|
+ [gain by reduction of branch penalty].
|
||
|
+ 3. The final pad with null bytes is done by calling an optimized
|
||
|
+ memset. */
|
||
|
+
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+# define FUNC_NAME __stpncpy
|
||
|
+#else
|
||
|
+# define FUNC_NAME strncpy
|
||
|
+#endif
|
||
|
+
|
||
|
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
|
||
|
+
|
||
|
+#ifndef MEMSET
|
||
|
+/* For builds with no IFUNC support, local calls should be made to internal
|
||
|
+ GLIBC symbol (created by libc_hidden_builtin_def). */
|
||
|
+# ifdef SHARED
|
||
|
+# define MEMSET __GI_memset
|
||
|
+# else
|
||
|
+# define MEMSET memset
|
||
|
+# endif
|
||
|
+#endif
|
||
|
+
|
||
|
+ .machine power7
|
||
|
+EALIGN(FUNC_NAME, 4, 0)
|
||
|
+ CALL_MCOUNT 3
|
||
|
+
|
||
|
+ mflr r0 /* load link register LR to r0 */
|
||
|
+ or r10, r3, r4 /* to verify source and destination */
|
||
|
+ rldicl. r8, r10, 0, 61 /* is double word aligned .. ? */
|
||
|
+
|
||
|
+ std r19, -8(r1) /* save callers register , r19 */
|
||
|
+ std r18, -16(r1) /* save callers register , r18 */
|
||
|
+ std r0, 16(r1) /* store the link register */
|
||
|
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame */
|
||
|
+
|
||
|
+ mr r9, r3 /* save r3 into r9 for use */
|
||
|
+ mr r18, r3 /* save r3 for retCode of strncpy */
|
||
|
+ bne 0, L(byte_by_byte)
|
||
|
+
|
||
|
+
|
||
|
+ srdi r11, r5, 3 /* compute count for CTR ; count = n/8 */
|
||
|
+ cmpldi cr7, r11, 3 /* if count > 3 ; perform unrolling 4 times */
|
||
|
+ ble 7, L(update1)
|
||
|
+
|
||
|
+ ld r10, 0(r4) /* load doubleWord from src */
|
||
|
+ cmpb r8, r10, r8 /* compare src with NULL ,we read just now */
|
||
|
+ cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */
|
||
|
+ bne cr7, L(update3)
|
||
|
+
|
||
|
+ std r10, 0(r3) /* copy doubleword at offset=0 */
|
||
|
+ ld r10, 8(r4) /* load next doubleword from offset=8 */
|
||
|
+ cmpb r8, r10, r8 /* compare src with NULL , we read just now */
|
||
|
+ cmpdi cr7, r8, 0 /* if cmpb returned NULL ; we continue */
|
||
|
+ bne 7,L(HopBy8)
|
||
|
+
|
||
|
+ addi r8, r11, -4
|
||
|
+ mr r7, r3
|
||
|
+ srdi r8, r8, 2
|
||
|
+ mr r6, r4
|
||
|
+ addi r8, r8, 1
|
||
|
+ li r12, 0
|
||
|
+ mtctr r8
|
||
|
+ b L(dwordCopy)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(dWordUnroll):
|
||
|
+ std r8, 16(r9)
|
||
|
+ ld r8, 24(r4) /* load dword,perform loop unrolling again */
|
||
|
+ cmpb r10, r8, r10
|
||
|
+ cmpdi cr7, r10, 0
|
||
|
+ bne cr7, L(HopBy24)
|
||
|
+
|
||
|
+ std r8, 24(r7) /* copy dword at offset=24 */
|
||
|
+ addi r9, r9, 32
|
||
|
+ addi r4, r4, 32
|
||
|
+ bdz L(leftDwords) /* continue with loop on counter */
|
||
|
+
|
||
|
+ ld r3, 32(r6)
|
||
|
+ cmpb r8, r3, r10
|
||
|
+ cmpdi cr7, r8, 0
|
||
|
+ bne cr7, L(update2)
|
||
|
+
|
||
|
+ std r3, 32(r7)
|
||
|
+ ld r10, 40(r6)
|
||
|
+ cmpb r8, r10, r8
|
||
|
+ cmpdi cr7, r8, 0
|
||
|
+ bne cr7, L(HopBy40)
|
||
|
+
|
||
|
+ mr r6, r4 /* update values */
|
||
|
+ mr r7, r9
|
||
|
+ mr r11, r0
|
||
|
+ mr r5, r19
|
||
|
+
|
||
|
+L(dwordCopy):
|
||
|
+ std r10, 8(r9) /* copy dword at offset=8 */
|
||
|
+ addi r19, r5, -32
|
||
|
+ addi r0, r11, -4
|
||
|
+ ld r8, 16(r4)
|
||
|
+ cmpb r10, r8, r12
|
||
|
+ cmpdi cr7, r10, 0
|
||
|
+ beq cr7, L(dWordUnroll)
|
||
|
+
|
||
|
+ addi r9, r9, 16 /* increment dst by 16 */
|
||
|
+ addi r4, r4, 16 /* increment src by 16 */
|
||
|
+ addi r5, r5, -16 /* decrement length 'n' by 16 */
|
||
|
+ addi r0, r11, -2 /* decrement loop counter */
|
||
|
+
|
||
|
+L(dWordUnrollOFF):
|
||
|
+ ld r10, 0(r4) /* load first dword */
|
||
|
+ li r8, 0 /* load mask */
|
||
|
+ cmpb r8, r10, r8
|
||
|
+ cmpdi cr7, r8, 0
|
||
|
+ bne cr7, L(byte_by_byte)
|
||
|
+ mtctr r0
|
||
|
+ li r7, 0
|
||
|
+ b L(CopyDword)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(loadDWordandCompare):
|
||
|
+ ld r10, 0(r4)
|
||
|
+ cmpb r8, r10, r7
|
||
|
+ cmpdi cr7, r8, 0
|
||
|
+ bne cr7, L(byte_by_byte)
|
||
|
+
|
||
|
+L(CopyDword):
|
||
|
+ addi r9, r9, 8
|
||
|
+ std r10, -8(r9)
|
||
|
+ addi r4, r4, 8
|
||
|
+ addi r5, r5, -8
|
||
|
+ bdnz L(loadDWordandCompare)
|
||
|
+
|
||
|
+L(byte_by_byte):
|
||
|
+ cmpldi cr7, r5, 3
|
||
|
+ ble cr7, L(verifyByte)
|
||
|
+ srdi r10, r5, 2
|
||
|
+ mr r19, r9
|
||
|
+ mtctr r10
|
||
|
+ b L(firstByteUnroll)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(bytes_unroll):
|
||
|
+ lbz r10, 1(r4) /* load byte from src */
|
||
|
+ cmpdi cr7, r10, 0 /* compare for NULL */
|
||
|
+ stb r10, 1(r19) /* store byte to dst */
|
||
|
+ beq cr7, L(updtDestComputeN2ndByte)
|
||
|
+
|
||
|
+ addi r4, r4, 4 /* advance src */
|
||
|
+
|
||
|
+ lbz r10, -2(r4) /* perform loop unrolling for byte r/w */
|
||
|
+ cmpdi cr7, r10, 0
|
||
|
+ stb r10, 2(r19)
|
||
|
+ beq cr7, L(updtDestComputeN3rdByte)
|
||
|
+
|
||
|
+ lbz r10, -1(r4) /* perform loop unrolling for byte r/w */
|
||
|
+ addi r19, r19, 4
|
||
|
+ cmpdi cr7, r10, 0
|
||
|
+ stb r10, -1(r19)
|
||
|
+ beq cr7, L(ComputeNByte)
|
||
|
+
|
||
|
+ bdz L(update0)
|
||
|
+
|
||
|
+L(firstByteUnroll):
|
||
|
+ lbz r10, 0(r4) /* perform loop unrolling for byte r/w */
|
||
|
+ cmpdi cr7, 10, 0
|
||
|
+ stb r10, 0(r19)
|
||
|
+ bne cr7, L(bytes_unroll)
|
||
|
+ addi r19, r19, 1
|
||
|
+
|
||
|
+L(ComputeNByte):
|
||
|
+ subf r9, r19, r9 /* compute 'n' bytes to fill */
|
||
|
+ add r8, r9, r5
|
||
|
+
|
||
|
+L(zeroFill):
|
||
|
+ cmpdi cr7, r8, 0 /* compare if length is zero */
|
||
|
+ beq cr7, L(update3return)
|
||
|
+
|
||
|
+ mr r3, r19 /* fill buffer with */
|
||
|
+ li r4, 0 /* zero fill buffer */
|
||
|
+ mr r5, r8 /* how many bytes to fill buffer with */
|
||
|
+ bl MEMSET /* call optimized memset */
|
||
|
+ nop
|
||
|
+
|
||
|
+L(update3return):
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+ addi r3, r19, -1 /* update return value */
|
||
|
+#endif
|
||
|
+
|
||
|
+L(hop2return):
|
||
|
+#ifndef USE_AS_STPNCPY
|
||
|
+ mr r3, r18 /* set return value */
|
||
|
+#endif
|
||
|
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
|
||
|
+ ld r0, 16(r1) /* read the saved link register */
|
||
|
+ ld r18, -16(r1) /* restore callers save register, r18 */
|
||
|
+ ld r19, -8(r1) /* restore callers save register, r19 */
|
||
|
+ mtlr r0 /* restore the link register */
|
||
|
+ blr /* return */
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(update0):
|
||
|
+ mr r9, r19
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(verifyByte):
|
||
|
+ rldicl. r8, r5, 0, 62
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+ mr r3, r9
|
||
|
+#endif
|
||
|
+ beq cr0, L(hop2return)
|
||
|
+ mtctr r8
|
||
|
+ addi r4, r4, -1
|
||
|
+ mr r19, r9
|
||
|
+ b L(oneBYone)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(proceed):
|
||
|
+ bdz L(done)
|
||
|
+
|
||
|
+L(oneBYone):
|
||
|
+ lbzu r10, 1(r4) /* copy byte */
|
||
|
+ addi r19, r19, 1
|
||
|
+ addi r8, r8, -1
|
||
|
+ cmpdi cr7, r10, 0
|
||
|
+ stb r10, -1(r19)
|
||
|
+ bne cr7, L(proceed)
|
||
|
+ b L(zeroFill)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(done):
|
||
|
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+ mr r3, r19 /* set the return value */
|
||
|
+#else
|
||
|
+ mr r3, r18 /* set the return value */
|
||
|
+#endif
|
||
|
+ ld r0, 16(r1) /* read the saved link register */
|
||
|
+ ld r18, -16(r1) /* restore callers save register, r18 */
|
||
|
+ ld r19, -8(r1) /* restore callers save register, r19 */
|
||
|
+ mtlr r0 /* restore the link register */
|
||
|
+ blr /* return */
|
||
|
+
|
||
|
+L(update1):
|
||
|
+ mr r0, r11
|
||
|
+ mr r19, r5
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(leftDwords):
|
||
|
+ cmpdi cr7, r0, 0
|
||
|
+ mr r5, r19
|
||
|
+ bne cr7, L(dWordUnrollOFF)
|
||
|
+ b L(byte_by_byte)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(updtDestComputeN2ndByte):
|
||
|
+ addi r19, r19, 2 /* update dst by 2 */
|
||
|
+ subf r9, r19, r9 /* compute distance covered */
|
||
|
+ add r8, r9, r5
|
||
|
+ b L(zeroFill)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(updtDestComputeN3rdByte):
|
||
|
+ addi r19, r19, 3 /* update dst by 3 */
|
||
|
+ subf r9, r19, r9 /* compute distance covered */
|
||
|
+ add r8, r9, r5
|
||
|
+ b L(zeroFill)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(HopBy24):
|
||
|
+ addi r9, r9, 24 /* increment dst by 24 */
|
||
|
+ addi r4, r4, 24 /* increment src by 24 */
|
||
|
+ addi r5, r5, -24 /* decrement length 'n' by 24 */
|
||
|
+ addi r0, r11, -3 /* decrement loop counter */
|
||
|
+ b L(dWordUnrollOFF)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(update2):
|
||
|
+ mr r5, r19
|
||
|
+ b L(dWordUnrollOFF)
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(HopBy40):
|
||
|
+ addi r9, r7, 40 /* increment dst by 40 */
|
||
|
+ addi r4, r6, 40 /* increment src by 40 */
|
||
|
+ addi r5, r5, -40 /* decrement length 'n' by 40 */
|
||
|
+ addi r0, r11, -5 /* decrement loop counter */
|
||
|
+ b L(dWordUnrollOFF)
|
||
|
+
|
||
|
+L(update3):
|
||
|
+ mr r0, r11
|
||
|
+ b L(dWordUnrollOFF)
|
||
|
+
|
||
|
+L(HopBy8):
|
||
|
+ addi r9, r3, 8 /* increment dst by 8 */
|
||
|
+ addi r4, r4, 8 /* increment src by 8 */
|
||
|
+ addi r5, r5, -8 /* decrement length 'n' by 8 */
|
||
|
+ addi r0, r11, -1 /* decrement loop counter */
|
||
|
+ b L(dWordUnrollOFF)
|
||
|
+END(FUNC_NAME)
|
||
|
+#ifndef USE_AS_STPNCPY
|
||
|
+libc_hidden_builtin_def (strncpy)
|
||
|
+#endif
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
|
||
|
new file mode 100644
|
||
|
index 0000000..76a1466
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
|
||
|
@@ -0,0 +1,20 @@
|
||
|
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
|
||
|
+ Copyright (C) 2015 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#define USE_AS_STPNCPY
|
||
|
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
|
||
|
new file mode 100644
|
||
|
index 0000000..5fda953
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
|
||
|
@@ -0,0 +1,424 @@
|
||
|
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
|
||
|
+ Copyright (C) 2015 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <http://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <sysdep.h>
|
||
|
+
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+# define FUNC_NAME __stpncpy
|
||
|
+#else
|
||
|
+# define FUNC_NAME strncpy
|
||
|
+#endif
|
||
|
+
|
||
|
+/* Implements the function
|
||
|
+
|
||
|
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
|
||
|
+
|
||
|
+ or
|
||
|
+
|
||
|
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
|
||
|
+
|
||
|
+ if USE_AS_STPNCPY is defined.
|
||
|
+
|
||
|
+ The implementation uses unaligned doubleword access to avoid specialized
|
||
|
+ code paths depending on data alignment. Although recent powerpc64 uses
|
||
|
+ 64K as default, the page cross handling assumes minimum page size of
|
||
|
+ 4k. */
|
||
|
+
|
||
|
+ .machine power7
|
||
|
+EALIGN (FUNC_NAME, 4, 0)
|
||
|
+
|
||
|
+ /* Check if the [src]+16 will cross a 4K page by checking if the bit
|
||
|
+ indicating the page size changes. Basically:
|
||
|
+
|
||
|
+ uint64_t srcin = (uint64_t)src;
|
||
|
+ uint64_t ob = srcin & 4096UL;
|
||
|
+ uint64_t nb = (srcin+16UL) & 4096UL;
|
||
|
+ if (ob ^ nb)
|
||
|
+ goto pagecross; */
|
||
|
+
|
||
|
+ addi r10,r4,16 /* r10 = src + 16. */
|
||
|
+ rlwinm r9,r4,0,19,19 /* r9 = src & 4096. */
|
||
|
+
|
||
|
+ /* Since it is a leaf function, save some non-volatile registers on the
|
||
|
+ protected/red zone. */
|
||
|
+ std r26,-48(r1)
|
||
|
+ std r27,-40(r1)
|
||
|
+
|
||
|
+ rlwinm r8,r10,0,19,19 /* r8 = (src + 16) & 4096. */
|
||
|
+
|
||
|
+ std r28,-32(r1)
|
||
|
+ std r29,-24(r1)
|
||
|
+
|
||
|
+ cmpld cr7,r9,r8 /* Did the 4K bit change between src and src + 16? */
|
||
|
+
|
||
|
+ std r30,-16(r1)
|
||
|
+ std r31,-8(r1)
|
||
|
+
|
||
|
+ beq cr7,L(unaligned_lt_16) /* Bit unchanged: take the fast path. */
|
||
|
+ rldicl r9,r4,0,61 /* r9 = src & 7. */
|
||
|
+ subfic r8,r9,8 /* r8 = 8 - (src & 7). */
|
||
|
+ cmpld cr7,r5,r8
|
||
|
+ bgt cr7,L(pagecross) /* n > r8: page-cross path, else fall through. */
|
||
|
+
|
||
|
+ /* At this point there are 1 to 15 bytes to check and write. Since it could
|
||
|
+ be either from first unaligned 16 bytes access or from bulk copy, the code
|
||
|
+ uses an unrolled byte read/write instead of trying to analyze the cmpb
|
||
|
+ results. */
|
||
|
+L(short_path):
|
||
|
+ mr r9,r3
|
||
|
+L(short_path_1):
|
||
|
+ cmpdi cr7,r5,0
|
||
|
+ beq cr7,L(short_path_loop_end_1)
|
||
|
+L(short_path_2):
|
||
|
+ lbz r10,0(r4)
|
||
|
+ cmpdi cr7,r10,0
|
||
|
+ stb r10,0(r9)
|
||
|
+ beq cr7,L(zero_pad_start_1)
|
||
|
+ cmpdi cr0,r5,1
|
||
|
+ addi r8,r9,1
|
||
|
+ addi r6,r5,-1
|
||
|
+ beq cr0,L(short_path_loop_end_0)
|
||
|
+ lbz r10,1(r4)
|
||
|
+ cmpdi cr7,r10,0
|
||
|
+ stb r10,1(r9)
|
||
|
+ beq cr7,L(zero_pad_start_prepare_1)
|
||
|
+ addi r10,r5,-3
|
||
|
+ b L(short_path_loop_1)
|
||
|
+
|
||
|
+ .align 4
|
||
|
+L(short_path_loop):
|
||
|
+ lbz r8,0(r4) /* Load the next source byte. */
|
||
|
+ addi r7,r10,-2 /* r7 = r10 - 2, becomes the new r10 below. */
|
||
|
+ cmpdi cr5,r8,0
|
||
|
+ stb r8,0(r9) /* Store it before testing for the terminator. */
|
||
|
+ beq cr5,L(zero_pad_start_1) /* Null byte copied: go pad. */
|
||
|
+ beq cr7,L(short_path_loop_end_0) /* cr7 comes from cmpdi cr7,r10,0 in L(short_path_loop_1). */
|
||
|
+ lbz r8,1(r4) /* Second byte of this iteration. */
|
||
|
+ cmpdi cr7,r8,0
|
||
|
+ stb r8,1(r9)
|
||
|
+ beq cr7,L(zero_pad_start) /* Null byte copied: go pad. */
|
||
|
+ mr r10,r7
|
||
|
+L(short_path_loop_1):
|
||
|
+ addic. r5,r5,-2
|
||
|
+ addi r9,r9,2
|
||
|
+ cmpdi cr7,r10,0
|
||
|
+ addi r4,r4,2
|
||
|
+ addi r6,r9,1
|
||
|
+ bne cr0,L(short_path_loop)
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+ mr r3,r9
|
||
|
+ b L(short_path_loop_end)
|
||
|
+#endif
|
||
|
+
|
||
|
+L(short_path_loop_end_0):
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+ addi r3,r9,1
|
||
|
+ b L(short_path_loop_end)
|
||
|
+#endif
|
||
|
+L(short_path_loop_end_1):
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+ mr r3,r9
|
||
|
+#endif
|
||
|
+L(short_path_loop_end):
|
||
|
+ /* Restore non-volatile registers. */
|
||
|
+ ld r26,-48(r1)
|
||
|
+ ld r27,-40(r1)
|
||
|
+ ld r28,-32(r1)
|
||
|
+ ld r29,-24(r1)
|
||
|
+ ld r30,-16(r1)
|
||
|
+ ld r31,-8(r1)
|
||
|
+ blr
|
||
|
+
|
||
|
+ /* This code pads the remainder dest with NULL bytes. The algorithm
|
||
|
+ calculates the remaining size and issues a doubleword unrolled
|
||
|
+ loop followed by a byte-by-byte set. */
|
||
|
+ .align 4
|
||
|
+L(zero_pad_start):
|
||
|
+ mr r5,r10
|
||
|
+ mr r9,r6
|
||
|
+L(zero_pad_start_1):
|
||
|
+ srdi. r8,r5,3 /* r8 = r5 >> 3 (doubleword count); sets cr0. */
|
||
|
+ mr r10,r9
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+ mr r3,r9 /* stpncpy result: the current r9. */
|
||
|
+#endif
|
||
|
+ beq- cr0,L(zero_pad_loop_b_start) /* No full doubleword to write. */
|
||
|
+ cmpldi cr7,r8,1
|
||
|
+ li r7,0 /* Zero doubleword used by the stores below. */
|
||
|
+ std r7,0(r9)
|
||
|
+ beq cr7,L(zero_pad_loop_b_prepare) /* Exactly one doubleword. */
|
||
|
+ addic. r8,r8,-2
|
||
|
+ addi r10,r9,16 /* r10 = r9 + 16. */
|
||
|
+ std r7,8(r9)
|
||
|
+ beq cr0,L(zero_pad_loop_dw_2) /* Exactly two doublewords. */
|
||
|
+ std r7,16(r9)
|
||
|
+ li r9,0 /* r9 now holds the zero stored by the loop. */
|
||
|
+ b L(zero_pad_loop_dw_1)
|
||
|
+
|
||
|
+ .align 4
|
||
|
+L(zero_pad_loop_dw):
|
||
|
+ addi r10,r10,16
|
||
|
+ std r9,-8(r10)
|
||
|
+ beq cr0,L(zero_pad_loop_dw_2)
|
||
|
+ std r9,0(r10)
|
||
|
+L(zero_pad_loop_dw_1):
|
||
|
+ cmpldi cr7,r8,1
|
||
|
+ std r9,0(r10)
|
||
|
+ addic. r8,r8,-2
|
||
|
+ bne cr7,L(zero_pad_loop_dw)
|
||
|
+ addi r10,r10,8
|
||
|
+L(zero_pad_loop_dw_2):
|
||
|
+ rldicl r5,r5,0,61
|
||
|
+L(zero_pad_loop_b_start):
|
||
|
+ cmpdi cr7,r5,0 /* Any bytes left to pad? Tested by the beq- below. */
|
||
|
+ addi r5,r5,-1
|
||
|
+ addi r9,r10,-1
|
||
|
+ add r10,r10,r5
|
||
|
+ subf r10,r9,r10 /* r10 = r5 as it was on entry to this label. */
|
||
|
+ li r8,0 /* Zero value for the tail stores. */
|
||
|
+ beq- cr7,L(short_path_loop_end) /* r5 was 0: nothing left to write. */
|
||
|
+
|
||
|
+ /* Write remaining 1-8 bytes. */
|
||
|
+ .align 4
|
||
|
+ addi r9,r9,1
|
||
|
+ mtocrf 0x1,r10
|
||
|
+ bf 29,4f
|
||
|
+ stw r8,0(r9)
|
||
|
+ addi r9,r9,4
|
||
|
+
|
||
|
+ .align 4
|
||
|
+4: bf 30,2f
|
||
|
+ sth r8,0(r9)
|
||
|
+ addi r9,r9,2
|
||
|
+
|
||
|
+ .align 4
|
||
|
+2: bf 31,1f
|
||
|
+ stb r8,0(r9)
|
||
|
+
|
||
|
+ /* Restore non-volatile registers. */
|
||
|
+1: ld r26,-48(r1)
|
||
|
+ ld r27,-40(r1)
|
||
|
+ ld r28,-32(r1)
|
||
|
+ ld r29,-24(r1)
|
||
|
+ ld r30,-16(r1)
|
||
|
+ ld r31,-8(r1)
|
||
|
+ blr
|
||
|
+
|
||
|
+ /* The common case where [src]+16 will not cross a 4K page boundary.
|
||
|
+ In this case the code fast check the first 16 bytes by using doubleword
|
||
|
+ read/compares and update destination if neither total size nor null byte
|
||
|
+ is found in the source. */
|
||
|
+ .align 4
|
||
|
+L(unaligned_lt_16):
|
||
|
+ cmpldi cr7,r5,7
|
||
|
+ ble cr7,L(short_path) /* n <= 7: byte-by-byte path. */
|
||
|
+ ld r7,0(r4) /* First source doubleword. */
|
||
|
+ li r8,0
|
||
|
+ cmpb r8,r7,r8 /* Flag the bytes of r7 that are zero. */
|
||
|
+ cmpdi cr7,r8,0
|
||
|
+ bne cr7,L(short_path_prepare_2) /* Null byte in the first 8 bytes. */
|
||
|
+ addi r6,r5,-8 /* r6 = n - 8. */
|
||
|
+ std r7,0(r3)
|
||
|
+ addi r9,r3,8 /* r9 = dst + 8. */
|
||
|
+ cmpldi cr7,r6,7
|
||
|
+ addi r7,r4,8 /* r7 = src + 8. */
|
||
|
+ ble cr7,L(short_path_prepare_1_1) /* At most 7 bytes remain. */
|
||
|
+ ld r4,8(r4) /* Second source doubleword. */
|
||
|
+ cmpb r8,r4,r8 /* r8 is still zero here. */
|
||
|
+ cmpdi cr7,r8,0
|
||
|
+ bne cr7,L(short_path_prepare_2_1) /* Null byte in the second 8 bytes. */
|
||
|
+ std r4,8(r3)
|
||
|
+ addi r29,r3,16 /* r29 = dst + 16. */
|
||
|
+ addi r5,r5,-16 /* n -= 16. */
|
||
|
+ /* Neither the null byte was found nor the total length was reached,
|
||
|
+ align to 16 bytes and issue a bulk copy/compare. */
|
||
|
+ b L(align_to_16b)
|
||
|
+
|
||
|
+ /* In the case of 4k page boundary cross, the algorithm first align
|
||
|
+ the address to a doubleword, calculate a mask based on alignment
|
||
|
+ to ignore the bytes and continue using doubleword. */
|
||
|
+ .align 4
|
||
|
+L(pagecross):
|
||
|
+ rldicr r11,r4,0,59 /* Align the address down. NOTE(review): ME=59 clears the low 4 bits (16 bytes), not 3 -- confirm. */
|
||
|
+ li r6,-1 /* MASK = 0xffffffffffffffffUL. */
|
||
|
+ sldi r9,r9,3 /* Calculate padding. */
|
||
|
+ ld r7,0(r11) /* Load doubleword from memory. */
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ sld r9,r6,r9 /* MASK = MASK << padding. */
|
||
|
+#else
|
||
|
+ srd r9,r6,r9 /* MASK = MASK >> padding. */
|
||
|
+#endif
|
||
|
+ orc r9,r7,r9 /* Mask bits that are not part of the
|
||
|
+ string. */
|
||
|
+ li r7,0
|
||
|
+ cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
|
||
|
+ cmpdi cr7,r9,0
|
||
|
+ bne cr7,L(short_path_prepare_2)
|
||
|
+ subf r8,r8,r5 /* Adjust total length. */
|
||
|
+ cmpldi cr7,r8,8 /* Check if length was reached. */
|
||
|
+ ble cr7,L(short_path_prepare_2)
|
||
|
+
|
||
|
+ /* For next checks we have aligned address, so we check for more
|
||
|
+ three doublewords to make sure we can read 16 unaligned bytes
|
||
|
+ to start the bulk copy with 16 aligned addresses. */
|
||
|
+ ld r7,8(r11) /* Second doubleword. */
|
||
|
+ cmpb r9,r7,r9 /* r9 is zero here, so this flags null bytes. */
|
||
|
+ cmpdi cr7,r9,0
|
||
|
+ bne cr7,L(short_path_prepare_2)
|
||
|
+ addi r7,r8,-8 /* r7 = adjusted length - 8. */
|
||
|
+ cmpldi cr7,r7,8
|
||
|
+ ble cr7,L(short_path_prepare_2)
|
||
|
+ ld r7,16(r11) /* Third doubleword. */
|
||
|
+ cmpb r9,r7,r9
|
||
|
+ cmpdi cr7,r9,0
|
||
|
+ bne cr7,L(short_path_prepare_2)
|
||
|
+ addi r8,r8,-16 /* r8 = adjusted length - 16. */
|
||
|
+ cmpldi cr7,r8,8
|
||
|
+ ble cr7,L(short_path_prepare_2)
|
||
|
+ ld r8,24(r11) /* Fourth doubleword. */
|
||
|
+ cmpb r9,r8,r9
|
||
|
+ cmpdi cr7,r9,0
|
||
|
+ bne cr7,L(short_path_prepare_2)
|
||
|
+
|
||
|
+ /* No null byte found in the 32 bytes read and length not reached,
|
||
|
+ read source again using unaligned loads and store them. */
|
||
|
+ ld r9,0(r4)
|
||
|
+ addi r29,r3,16 /* r29 = dst + 16. */
|
||
|
+ addi r5,r5,-16 /* n -= 16. */
|
||
|
+ std r9,0(r3)
|
||
|
+ ld r9,8(r4)
|
||
|
+ std r9,8(r3)
|
||
|
+
|
||
|
+ /* Align source to 16 bytes and adjust destination and size. */
|
||
|
+L(align_to_16b):
|
||
|
+ rldicl r9,r10,0,60 /* r9 = r10 & 15. */
|
||
|
+ rldicr r28,r10,0,59 /* r28 = r10 rounded down to 16 bytes. */
|
||
|
+ add r12,r5,r9 /* Add the realignment slack back to the length. */
|
||
|
+ subf r29,r9,r29 /* Move r29 back by the same amount. */
|
||
|
+
|
||
|
+ /* The bulk read/compare/copy loads two doublewords, compare and merge
|
||
|
+ in a single register for speed. This is an attempt to speed up the
|
||
|
+ null-checking process for bigger strings. */
|
||
|
+
|
||
|
+ cmpldi cr7,r12,15
|
||
|
+ ble cr7,L(short_path_prepare_1_2) /* Fewer than 16 bytes left. */
|
||
|
+
|
||
|
+ /* Main loop for large sizes, unrolled 2 times to get better use of
|
||
|
+ pipeline. */
|
||
|
+ ld r8,0(r28)
|
||
|
+ ld r10,8(r28)
|
||
|
+ li r9,0
|
||
|
+ cmpb r7,r8,r9 /* Flag null bytes in the first doubleword. */
|
||
|
+ cmpb r9,r10,r9 /* Flag null bytes in the second doubleword. */
|
||
|
+ or. r6,r9,r7 /* Merge both results; sets cr0. */
|
||
|
+ bne cr0,L(short_path_prepare_2_3) /* Null byte in these 16 bytes. */
|
||
|
+ addi r5,r12,-16
|
||
|
+ addi r4,r28,16
|
||
|
+ std r8,0(r29)
|
||
|
+ std r10,8(r29)
|
||
|
+ cmpldi cr7,r5,15
|
||
|
+ addi r9,r29,16
|
||
|
+ ble cr7,L(short_path_1) /* Fewer than 16 bytes left. */
|
||
|
+ mr r11,r28
|
||
|
+ mr r6,r29
|
||
|
+ li r30,0 /* Zero comparand used by L(loop_16b). */
|
||
|
+ subfic r26,r4,48 /* r26 = 48 - r4. */
|
||
|
+ subfic r27,r9,48 /* r27 = 48 - r9. */
|
||
|
+
|
||
|
+ b L(loop_16b)
|
||
|
+
|
||
|
+ .align 4
|
||
|
+L(loop_start):
|
||
|
+ ld r31,0(r11)
|
||
|
+ ld r10,8(r11)
|
||
|
+ cmpb r0,r31,r7
|
||
|
+ cmpb r8,r10,r7
|
||
|
+ or. r7,r0,r8
|
||
|
+ addi r5,r5,-32
|
||
|
+ cmpldi cr7,r5,15
|
||
|
+ add r4,r4,r26
|
||
|
+ add r9,r9,r27
|
||
|
+ bne cr0,L(short_path_prepare_2_2)
|
||
|
+ add r4,r28,r4
|
||
|
+ std r31,0(r6)
|
||
|
+ add r9,r29,r9
|
||
|
+ std r10,8(r6)
|
||
|
+ ble cr7,L(short_path_1)
|
||
|
+
|
||
|
+L(loop_16b):
|
||
|
+ ld r10,16(r11)
|
||
|
+ ld r0,24(r11)
|
||
|
+ cmpb r8,r10,r30 /* Compare against r30 to flag null bytes. */
|
||
|
+ cmpb r7,r0,r30
|
||
|
+ or. r7,r8,r7 /* Merge both results; sets cr0. */
|
||
|
+ addi r12,r12,-32
|
||
|
+ cmpldi cr7,r12,15 /* More than 15 bytes left? Used by the bgt below. */
|
||
|
+ addi r11,r11,32
|
||
|
+ bne cr0,L(short_path_2) /* Null byte in these 16 bytes. */
|
||
|
+ std r10,16(r6)
|
||
|
+ addi r6,r6,32
|
||
|
+ std r0,-8(r6)
|
||
|
+ bgt cr7,L(loop_start)
|
||
|
+
|
||
|
+ mr r5,r12 /* Hand the remaining length, src and dst to the byte path. */
|
||
|
+ mr r4,r11
|
||
|
+ mr r9,r6
|
||
|
+ b L(short_path_1)
|
||
|
+
|
||
|
+ .align 4
|
||
|
+L(short_path_prepare_1_1):
|
||
|
+ mr r5,r6
|
||
|
+ mr r4,r7
|
||
|
+ b L(short_path_1)
|
||
|
+L(short_path_prepare_1_2):
|
||
|
+ mr r5,r12
|
||
|
+ mr r4,r28
|
||
|
+ mr r9,r29
|
||
|
+ b L(short_path_1)
|
||
|
+L(short_path_prepare_2):
|
||
|
+ mr r9,r3
|
||
|
+ b L(short_path_2)
|
||
|
+L(short_path_prepare_2_1):
|
||
|
+ mr r5,r6
|
||
|
+ mr r4,r7
|
||
|
+ b L(short_path_2)
|
||
|
+L(short_path_prepare_2_2):
|
||
|
+ mr r5,r12
|
||
|
+ mr r4,r11
|
||
|
+ mr r9,r6
|
||
|
+ b L(short_path_2)
|
||
|
+L(short_path_prepare_2_3):
|
||
|
+ mr r5,r12
|
||
|
+ mr r4,r28
|
||
|
+ mr r9,r29
|
||
|
+ b L(short_path_2)
|
||
|
+L(zero_pad_loop_b_prepare):
|
||
|
+ addi r10,r9,8
|
||
|
+ rldicl r5,r5,0,61
|
||
|
+ b L(zero_pad_loop_b_start)
|
||
|
+L(zero_pad_start_prepare_1):
|
||
|
+ mr r5,r6
|
||
|
+ mr r9,r8
|
||
|
+ b L(zero_pad_start_1)
|
||
|
+END (FUNC_NAME)
|
||
|
+
|
||
|
+#ifdef USE_AS_STPNCPY
|
||
|
+libc_hidden_def (__stpncpy)
|
||
|
+#else
|
||
|
+libc_hidden_builtin_def (strncpy)
|
||
|
+#endif
|