This patch provides a compatibility kludge for legacy applications which do not provide proper stack alignment. It cannot be upstreamed. The kludge is active for both i386 and x86-64, but it is actually only needed on i386. In Fedora, the same effect was achieved by disabling multi-arch altogether: https://bugzilla.redhat.com/show_bug.cgi?id=1471427 Further discussion is on this internal bug: https://bugzilla.redhat.com/show_bug.cgi?id=1478332 --- glibc-2.17-c758a686/sysdeps/x86_64/multiarch/ifunc-impl-list.c 2012-12-24 20:02:13.000000000 -0700 +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/ifunc-impl-list.c 2015-04-20 09:04:11.826569951 -0600 @@ -212,6 +212,8 @@ /* Support sysdeps/x86_64/multiarch/strstr-c.c. */ IFUNC_IMPL (i, name, strstr, + IFUNC_IMPL_ADD (array, i, strstr, use_unaligned_strstr (), + __strstr_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) --- glibc-2.17-c758a686/sysdeps/x86_64/multiarch/init-arch.h 2012-12-24 20:02:13.000000000 -0700 +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/init-arch.h 2015-04-20 12:15:17.984742925 -0600 @@ -63,6 +63,34 @@ #else /* __ASSEMBLER__ */ # include +# include +# include +# include + +/* Ugly hack to make it possible to select a strstr and strcasestr + implementation that avoids using the stack for 16-byte aligned + SSE temporaries. Doing so makes it possible to call the functions + with a stack that's not 16-byte aligned as can happen, for example, + as a result of compiling the functions' callers with the GCC + -mpreferred-stack-boubdary=2 or =3 option, or with the ICC + -falign-stack=assume-4-byte option. See rhbz 1150282 for details. + + The ifunc selector uses the unaligned version by default if this + file exists and is accessible. */ +# define ENABLE_STRSTR_UNALIGNED_PATHNAME \ + "/etc/sysconfig/64bit_strstr_via_64bit_strstr_sse2_unaligned" + +static bool __attribute__ ((unused)) +use_unaligned_strstr (void) +{ + struct stat unaligned_strstr_etc_sysconfig_file; + + /* TLS may not have been set up yet, so avoid using stat since it tries to + set errno. */ + return INTERNAL_SYSCALL (stat, , 2, + ENABLE_STRSTR_UNALIGNED_PATHNAME, + &unaligned_strstr_etc_sysconfig_file) == 0; +} enum { --- glibc-2.17-c758a686/sysdeps/x86_64/multiarch/Makefile 2012-12-24 20:02:13.000000000 -0700 +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/Makefile 2015-04-20 09:04:11.854569626 -0600 @@ -17,7 +17,7 @@ strcat-sse2-unaligned strncat-sse2-unaligned \ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \ - memcmp-ssse3 + memcmp-ssse3 strstr-sse2-unaligned ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 --- glibc-2.17-c758a686/sysdeps/x86_64/multiarch/strcasestr-c.c 2012-12-24 20:02:13.000000000 -0700 +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/strcasestr-c.c 2015-04-20 09:04:11.861569545 -0600 @@ -12,7 +12,8 @@ #if 1 libc_ifunc (__strcasestr, - HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2); + HAS_SSE4_2 && !use_unaligned_strstr () ? __strcasestr_sse42 : + __strcasestr_sse2); #else libc_ifunc (__strcasestr, 0 ? __strcasestr_sse42 : __strcasestr_sse2); --- glibc-2.17-c758a686/sysdeps/x86_64/multiarch/strstr-c.c 2012-12-24 20:02:13.000000000 -0700 +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/strstr-c.c 2015-04-20 09:04:11.866569487 -0600 @@ -34,6 +34,7 @@ #include "string/strstr.c" extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden; +extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; #include "init-arch.h" @@ -41,7 +42,9 @@ /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2) +libc_ifunc (__libc_strstr, HAS_SSE4_2 ? (use_unaligned_strstr () ? + __strstr_sse2_unaligned : + __strstr_sse42) : __strstr_sse2) #undef strstr strong_alias (__libc_strstr, strstr) --- glibc-2.17-c758a686/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S 1969-12-31 17:00:00.000000000 -0700 +++ glibc-2.17-c758a686/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S 2015-04-20 09:04:11.866569487 -0600 @@ -0,0 +1,374 @@ +/* strstr with unaligned loads + Copyright (C) 2009-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +ENTRY(__strstr_sse2_unaligned) + movzbl (%rsi), %eax + testb %al, %al + je L(empty) + movzbl 1(%rsi), %edx + testb %dl, %dl + je L(strchr) + movd %eax, %xmm1 + movd %edx, %xmm2 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4031, %rax + punpcklbw %xmm2, %xmm2 + punpcklwd %xmm1, %xmm1 + punpcklwd %xmm2, %xmm2 + pshufd $0, %xmm1, %xmm1 + pshufd $0, %xmm2, %xmm2 + ja L(cross_page) + movdqu (%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 1(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 16(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 17(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %r8d + pmovmskb %xmm0, %eax + salq $16, %rax + orq %rax, %r8 + je L(next_32_bytes) +L(next_pair_index): + bsf %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero1) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found1) + cmpb 2(%rax), %dl + jne L(next_pair) + xorl %edx, %edx + jmp L(pair_loop_start) + + .p2align 4 +L(strchr): + movzbl %al, %esi + jmp __strchr_sse2 + + .p2align 4 +L(pair_loop): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair) +L(pair_loop_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop) +L(found1): + ret +L(zero1): + xorl %eax, %eax + ret + + .p2align 4 +L(next_pair): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index) + + .p2align 4 +L(next_32_bytes): + movdqu 32(%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 33(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 49(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %eax + salq $32, %rax + pmovmskb %xmm0, %r8d + salq $48, %r8 + orq %rax, %r8 + je L(loop_header) +L(next_pair2_index): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero2) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found2) + cmpb 2(%rax), %dl + jne L(next_pair2) + xorl %edx, %edx + jmp L(pair_loop2_start) + + .p2align 4 +L(pair_loop2): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair2) +L(pair_loop2_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop2) +L(found2): + ret + L(zero2): + xorl %eax, %eax + ret +L(empty): + mov %rdi, %rax + ret + + .p2align 4 +L(next_pair2): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair2_index) +L(loop_header): + movq $-512, %r11 + movq %rdi, %r9 + + pxor %xmm7, %xmm7 + andq $-64, %rdi + + .p2align 4 +L(loop): + movdqa 64(%rdi), %xmm3 + movdqu 63(%rdi), %xmm6 + movdqa %xmm3, %xmm0 + pxor %xmm2, %xmm3 + pxor %xmm1, %xmm6 + movdqa 80(%rdi), %xmm10 + por %xmm3, %xmm6 + pminub %xmm10, %xmm0 + movdqu 79(%rdi), %xmm3 + pxor %xmm2, %xmm10 + pxor %xmm1, %xmm3 + movdqa 96(%rdi), %xmm9 + por %xmm10, %xmm3 + pminub %xmm9, %xmm0 + pxor %xmm2, %xmm9 + movdqa 112(%rdi), %xmm8 + addq $64, %rdi + pminub %xmm6, %xmm3 + movdqu 31(%rdi), %xmm4 + pminub %xmm8, %xmm0 + pxor %xmm2, %xmm8 + pxor %xmm1, %xmm4 + por %xmm9, %xmm4 + pminub %xmm4, %xmm3 + movdqu 47(%rdi), %xmm5 + pxor %xmm1, %xmm5 + por %xmm8, %xmm5 + pminub %xmm5, %xmm3 + pminub %xmm3, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + testl %eax, %eax + je L(loop) + pminub (%rdi), %xmm6 + pminub 32(%rdi),%xmm4 + pminub 48(%rdi),%xmm5 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm5 + pmovmskb %xmm6, %edx + movdqa 16(%rdi), %xmm8 + pcmpeqb %xmm7, %xmm4 + movdqu 15(%rdi), %xmm0 + pmovmskb %xmm5, %r8d + movdqa %xmm8, %xmm3 + pmovmskb %xmm4, %ecx + pcmpeqb %xmm1,%xmm0 + pcmpeqb %xmm2,%xmm3 + salq $32, %rcx + pcmpeqb %xmm7,%xmm8 + salq $48, %r8 + pminub %xmm0,%xmm3 + orq %rcx, %rdx + por %xmm3,%xmm8 + orq %rdx, %r8 + pmovmskb %xmm8, %eax + salq $16, %rax + orq %rax, %r8 + je L(loop) +L(next_pair_index3): + bsfq %r8, %rcx + addq %rdi, %rcx + cmpb $0, (%rcx) + je L(zero) + xorl %eax, %eax + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(success3) + cmpb 1(%rcx), %dl + jne L(next_pair3) + jmp L(pair_loop_start3) + + .p2align 4 +L(pair_loop3): + addq $1, %rax + cmpb 1(%rcx,%rax), %dl + jne L(next_pair3) +L(pair_loop_start3): + movzbl 3(%rsi,%rax), %edx + testb %dl, %dl + jne L(pair_loop3) +L(success3): + lea -1(%rcx), %rax + ret + + .p2align 4 +L(next_pair3): + addq %rax, %r11 + movq %rdi, %rax + subq %r9, %rax + cmpq %r11, %rax + jl L(switch_strstr) + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index3) + jmp L(loop) + + .p2align 4 +L(switch_strstr): + movq %rdi, %rdi + jmp __strstr_sse2 + + .p2align 4 +L(cross_page): + + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqa (%rax), %xmm3 + movdqu -1(%rax), %xmm4 + movdqa %xmm3, %xmm8 + movdqa 16(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm8 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm7 + pminub %xmm4, %xmm3 + movdqu 15(%rax), %xmm4 + pcmpeqb %xmm0, %xmm7 + por %xmm3, %xmm8 + movdqa %xmm5, %xmm3 + movdqa 32(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm6 + pmovmskb %xmm8, %ecx + pminub %xmm4, %xmm3 + movdqu 31(%rax), %xmm4 + por %xmm3, %xmm7 + movdqa %xmm5, %xmm3 + pcmpeqb %xmm0, %xmm6 + movdqa 48(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm7, %r8d + pcmpeqb %xmm2, %xmm3 + pcmpeqb %xmm5, %xmm0 + pminub %xmm4, %xmm3 + movdqu 47(%rax), %xmm4 + por %xmm3, %xmm6 + movdqa %xmm5, %xmm3 + salq $16, %r8 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm6, %r10d + pminub %xmm4, %xmm3 + por %xmm3, %xmm0 + salq $32, %r10 + orq %r10, %r8 + orq %rcx, %r8 + movl %edi, %ecx + pmovmskb %xmm0, %edx + subl %eax, %ecx + salq $48, %rdx + orq %rdx, %r8 + shrq %cl, %r8 + je L(loop_header) +L(next_pair_index4): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero) + + cmpq %rax,%rdi + je L(next_pair4) + + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found3) + cmpb 1(%rax), %dl + jne L(next_pair4) + xorl %edx, %edx + jmp L(pair_loop_start4) + + .p2align 4 +L(pair_loop4): + addq $1, %rdx + cmpb 1(%rax,%rdx), %cl + jne L(next_pair4) +L(pair_loop_start4): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop4) +L(found3): + subq $1, %rax + ret + + .p2align 4 +L(next_pair4): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index4) + jmp L(loop_header) + + .p2align 4 +L(found): + rep + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + +END(__strstr_sse2_unaligned) --- glibc-2.17-c758a686/sysdeps/i386/i686/multiarch/strstr-c.c 2012-12-24 20:02:13.000000000 -0700 +++ glibc-2.17-c758a686/sysdeps/i386/i686/multiarch/strstr-c.c 2015-04-20 09:04:11.876569371 -0600 @@ -1,8 +1,6 @@ /* Multiple versions of strstr All versions must be listed in ifunc-impl-list.c. */ -#include "init-arch.h" - #define STRSTR __strstr_ia32 #if defined SHARED && defined DO_VERSIONING && !defined NO_HIDDEN #undef libc_hidden_builtin_def @@ -17,13 +15,17 @@ #include "string/strstr.c" +#include "init-arch.h" + extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden; extern __typeof (__redirect_strstr) __strstr_ia32 attribute_hidden; /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_ia32) +libc_ifunc (__libc_strstr, + HAS_SSE4_2 && !use_unaligned_strstr () ? + __strstr_sse42 : __strstr_ia32) #undef strstr strong_alias (__libc_strstr, strstr)