You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
445 lines
16 KiB
445 lines
16 KiB
--- gcc/config/aarch64/aarch64.c |
|
+++ gcc/config/aarch64/aarch64.c |
|
@@ -3799,7 +3799,14 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2) |
|
output_asm_insn ("sub\t%0, %0, %1", xops); |
|
|
|
/* Probe at TEST_ADDR. */ |
|
- output_asm_insn ("str\txzr, [%0]", xops); |
|
+ if (flag_stack_clash_protection) |
|
+ { |
|
+ gcc_assert (xops[0] == stack_pointer_rtx); |
|
+ xops[1] = GEN_INT (PROBE_INTERVAL - 8); |
|
+ output_asm_insn ("str\txzr, [%0, %1]", xops); |
|
+ } |
|
+ else |
|
+ output_asm_insn ("str\txzr, [%0]", xops); |
|
|
|
/* Test if TEST_ADDR == LAST_ADDR. */ |
|
xops[1] = reg2; |
|
@@ -4589,6 +4596,133 @@ aarch64_set_handled_components (sbitmap components) |
|
cfun->machine->reg_is_wrapped_separately[regno] = true; |
|
} |
|
|
|
+/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch |
|
+ registers. */ |
|
+ |
|
+static void |
|
+aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, |
|
+ poly_int64 poly_size) |
|
+{ |
|
+ HOST_WIDE_INT size; |
|
+ if (!poly_size.is_constant (&size)) |
|
+ { |
|
+ sorry ("stack probes for SVE frames"); |
|
+ return; |
|
+ } |
|
+ |
|
+ HOST_WIDE_INT probe_interval |
|
+ = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL); |
|
+ HOST_WIDE_INT guard_size |
|
+ = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); |
|
+ HOST_WIDE_INT guard_used_by_caller = 1024; |
|
+ |
|
+ /* SIZE should be large enough to require probing here. i.e., it |
|
+ must be larger than GUARD_SIZE - GUARD_USED_BY_CALLER. |
|
+ |
|
+ We can allocate GUARD_SIZE - GUARD_USED_BY_CALLER as a single chunk |
|
+ without any probing. */ |
|
+ gcc_assert (size >= guard_size - guard_used_by_caller); |
|
+ aarch64_sub_sp (temp1, temp2, guard_size - guard_used_by_caller, true); |
|
+ HOST_WIDE_INT orig_size = size; |
|
+ size -= (guard_size - guard_used_by_caller); |
|
+ |
|
+ HOST_WIDE_INT rounded_size = size & -probe_interval; |
|
+ HOST_WIDE_INT residual = size - rounded_size; |
|
+ |
|
+ /* We can handle a small number of allocations/probes inline. Otherwise |
|
+ punt to a loop. */ |
|
+ if (rounded_size && rounded_size <= 4 * probe_interval) |
|
+ { |
|
+ /* We don't use aarch64_sub_sp here because we don't want to |
|
+ repeatedly load TEMP1. */ |
|
+ rtx step = GEN_INT (-probe_interval); |
|
+ if (probe_interval > ARITH_FACTOR) |
|
+ { |
|
+ emit_move_insn (temp1, step); |
|
+ step = temp1; |
|
+ } |
|
+ |
|
+ for (HOST_WIDE_INT i = 0; i < rounded_size; i += probe_interval) |
|
+ { |
|
+ rtx_insn *insn = emit_insn (gen_add2_insn (stack_pointer_rtx, step)); |
|
+ add_reg_note (insn, REG_STACK_CHECK, const0_rtx); |
|
+ |
|
+ if (probe_interval > ARITH_FACTOR) |
|
+ { |
|
+ RTX_FRAME_RELATED_P (insn) = 1; |
|
+ rtx adj = plus_constant (Pmode, stack_pointer_rtx, -probe_interval); |
|
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, |
|
+ gen_rtx_SET (stack_pointer_rtx, adj)); |
|
+ } |
|
+ |
|
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, |
|
+ (probe_interval |
|
+ - GET_MODE_SIZE (word_mode)))); |
|
+ emit_insn (gen_blockage ()); |
|
+ } |
|
+ dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size); |
|
+ } |
|
+ else if (rounded_size) |
|
+ { |
|
+ /* Compute the ending address. */ |
|
+ unsigned int scratchreg = REGNO (temp1); |
|
+ emit_move_insn (temp1, GEN_INT (-rounded_size)); |
|
+ rtx_insn *insn |
|
+ = emit_insn (gen_add3_insn (temp1, stack_pointer_rtx, temp1)); |
|
+ |
|
+ /* For the initial allocation, we don't have a frame pointer |
|
+ set up, so we always need CFI notes. If we're doing the |
|
+ final allocation, then we may have a frame pointer, in which |
|
+ case it is the CFA, otherwise we need CFI notes. |
|
+ |
|
+ We can determine which allocation we are doing by looking at |
|
+ the temporary register. IP0 is the initial allocation, IP1 |
|
+ is the final allocation. */ |
|
+ if (scratchreg == IP0_REGNUM || !frame_pointer_needed) |
|
+ { |
|
+ /* We want the CFA independent of the stack pointer for the |
|
+ duration of the loop. */ |
|
+ add_reg_note (insn, REG_CFA_DEF_CFA, |
|
+ plus_constant (Pmode, temp1, |
|
+ (rounded_size + (orig_size - size)))); |
|
+ RTX_FRAME_RELATED_P (insn) = 1; |
|
+ } |
|
+ |
|
+ /* This allocates and probes the stack. |
|
+ |
|
+ It also probes at a 4k interval regardless of the value of |
|
+ PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL. */ |
|
+ insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx, |
|
+ stack_pointer_rtx, temp1)); |
|
+ |
|
+ /* Now reset the CFA register if needed. */ |
|
+ if (scratchreg == IP0_REGNUM || !frame_pointer_needed) |
|
+ { |
|
+ add_reg_note (insn, REG_CFA_DEF_CFA, |
|
+ plus_constant (Pmode, stack_pointer_rtx, |
|
+ (rounded_size + (orig_size - size)))); |
|
+ RTX_FRAME_RELATED_P (insn) = 1; |
|
+ } |
|
+ |
|
+ emit_insn (gen_blockage ()); |
|
+ dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); |
|
+ } |
|
+ else |
|
+ dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size); |
|
+ |
|
+ /* Handle any residuals. |
|
+ Note that any residual must be probed. */ |
|
+ if (residual) |
|
+ { |
|
+ aarch64_sub_sp (temp1, temp2, residual, true); |
|
+ add_reg_note (get_last_insn (), REG_STACK_CHECK, const0_rtx); |
|
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, |
|
+ (residual - GET_MODE_SIZE (word_mode)))); |
|
+ emit_insn (gen_blockage ()); |
|
+ } |
|
+ return; |
|
+} |
|
+ |
|
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG |
|
is saved at BASE + OFFSET. */ |
|
|
|
@@ -4686,7 +4820,54 @@ aarch64_expand_prologue (void) |
|
rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM); |
|
rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM); |
|
|
|
- aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true); |
|
+ /* We do not fully protect aarch64 against stack clash style attacks |
|
+ as doing so would be prohibitively expensive with less utility over |
|
+ time as newer compilers are deployed. |
|
+ |
|
+ We assume the guard is at least 64k. Furthermore, we assume that |
|
+ the caller has not pushed the stack pointer more than 1k into |
|
+ the guard. A caller that pushes the stack pointer more than 1k into |
|
+ the guard is considered invalid. |
|
+ |
|
+ Note that the caller's ability to push the stack pointer into the |
|
+ guard is a function of the number and size of outgoing arguments and/or |
|
+ dynamic stack allocations due to the mandatory save of the link register |
|
+ in the caller's frame. |
|
+ |
|
+ With those assumptions the callee can allocate up to 63k of stack |
|
+ space without probing. |
|
+ |
|
+ When probing is needed, we emit a probe at the start of the prologue |
|
+ and every PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes thereafter. |
|
+ |
|
+ We have to track how much space has been allocated, but we do not |
|
+ track stores into the stack as implicit probes except for the |
|
+ fp/lr store. */ |
|
+ HOST_WIDE_INT guard_size |
|
+ = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); |
|
+ HOST_WIDE_INT guard_used_by_caller = 1024; |
|
+ if (flag_stack_clash_protection) |
|
+ { |
|
+ if (known_eq (frame_size, 0)) |
|
+ dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); |
|
+ else if (known_lt (initial_adjust, guard_size - guard_used_by_caller) |
|
+ && known_lt (final_adjust, guard_size - guard_used_by_caller)) |
|
+ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); |
|
+ } |
|
+ |
|
+ /* In theory we should never have both an initial adjustment |
|
+ and a callee save adjustment. Verify that is the case since the |
|
+ code below does not handle it for -fstack-clash-protection. */ |
|
+ gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0); |
|
+ |
|
+ /* Only probe if the initial adjustment is larger than the guard |
|
+ less the amount of the guard reserved for use by the caller's |
|
+ outgoing args. */ |
|
+ if (flag_stack_clash_protection |
|
+ && maybe_ge (initial_adjust, guard_size - guard_used_by_caller)) |
|
+ aarch64_allocate_and_probe_stack_space (ip0_rtx, ip1_rtx, initial_adjust); |
|
+ else |
|
+ aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true); |
|
|
|
if (callee_adjust != 0) |
|
aarch64_push_regs (reg1, reg2, callee_adjust); |
|
@@ -4742,7 +4923,31 @@ aarch64_expand_prologue (void) |
|
callee_adjust != 0 || emit_frame_chain); |
|
aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, |
|
callee_adjust != 0 || emit_frame_chain); |
|
- aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed); |
|
+ |
|
+ /* We may need to probe the final adjustment as well. */ |
|
+ if (flag_stack_clash_protection && maybe_ne (final_adjust, 0)) |
|
+ { |
|
+ /* First probe if the final adjustment is larger than the guard size |
|
+ less the amount of the guard reserved for use by the caller's |
|
+ outgoing args. */ |
|
+ if (maybe_ge (final_adjust, guard_size - guard_used_by_caller)) |
|
+ aarch64_allocate_and_probe_stack_space (ip1_rtx, ip0_rtx, |
|
+ final_adjust); |
|
+ else |
|
+ aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed); |
|
+ |
|
+ /* We must also probe if the final adjustment is larger than the guard |
|
+ that is assumed used by the caller. This may be sub-optimal. */ |
|
+ if (maybe_ge (final_adjust, guard_used_by_caller)) |
|
+ { |
|
+ if (dump_file) |
|
+ fprintf (dump_file, |
|
+ "Stack clash aarch64 large outgoing arg, probing\n"); |
|
+ emit_stack_probe (stack_pointer_rtx); |
|
+ } |
|
+ } |
|
+ else |
|
+ aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed); |
|
} |
|
|
|
/* Return TRUE if we can use a simple_return insn. |
|
@@ -10476,6 +10681,12 @@ aarch64_override_options_internal (struct gcc_options *opts) |
|
&& opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level) |
|
opts->x_flag_prefetch_loop_arrays = 1; |
|
|
|
+ /* We assume the guard page is 64k. */ |
|
+ maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE, |
|
+ 16, |
|
+ opts->x_param_values, |
|
+ global_options_set.x_param_values); |
|
+ |
|
aarch64_override_options_after_change_1 (opts); |
|
} |
|
|
|
@@ -17161,6 +17372,28 @@ aarch64_sched_can_speculate_insn (rtx_insn *insn) |
|
} |
|
} |
|
|
|
+/* It has been decided to allow up to 1kb of outgoing argument |
|
+ space to be allocated w/o probing. If more than 1kb of outgoing |
|
+ argument space is allocated, then it must be probed and the last |
|
+ probe must occur no more than 1kbyte away from the end of the |
|
+ allocated space. |
|
+ |
|
+ This implies that the residual part of an alloca allocation may |
|
+ need probing in cases where the generic code might not otherwise |
|
+ think a probe is needed. |
|
+ |
|
+ This target hook returns TRUE when allocating RESIDUAL bytes of |
|
+ alloca space requires an additional probe, otherwise FALSE is |
|
+ returned. */ |
|
+ |
|
+static bool |
|
+aarch64_stack_clash_protection_final_dynamic_probe (rtx residual) |
|
+{ |
|
+ return (residual == CONST0_RTX (Pmode) |
|
+ || GET_CODE (residual) != CONST_INT |
|
+ || INTVAL (residual) >= 1024); |
|
+} |
|
+ |
|
/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */ |
|
|
|
static int |
|
@@ -17669,6 +17902,10 @@ aarch64_libgcc_floating_mode_supported_p |
|
#undef TARGET_CONSTANT_ALIGNMENT |
|
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment |
|
|
|
+#undef TARGET_STACK_CLASH_PROTECTION_FINAL_DYNAMIC_PROBE |
|
+#define TARGET_STACK_CLASH_PROTECTION_FINAL_DYNAMIC_PROBE \ |
|
+ aarch64_stack_clash_protection_final_dynamic_probe |
|
+ |
|
#undef TARGET_COMPUTE_PRESSURE_CLASSES |
|
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes |
|
|
|
--- gcc/config/aarch64/aarch64.md |
|
+++ gcc/config/aarch64/aarch64.md |
|
@@ -5812,7 +5812,7 @@ |
|
) |
|
|
|
(define_insn "probe_stack_range" |
|
- [(set (match_operand:DI 0 "register_operand" "=r") |
|
+ [(set (match_operand:DI 0 "register_operand" "=rk") |
|
(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0") |
|
(match_operand:DI 2 "register_operand" "r")] |
|
UNSPECV_PROBE_STACK_RANGE))] |
|
--- gcc/testsuite/gcc.target/aarch64/stack-check-12.c |
|
+++ gcc/testsuite/gcc.target/aarch64/stack-check-12.c |
|
@@ -0,0 +1,20 @@ |
|
+/* { dg-do compile } */ |
|
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */ |
|
+/* { dg-require-effective-target supports_stack_clash_protection } */ |
|
+ |
|
+extern void arf (unsigned long int *, unsigned long int *); |
|
+void |
|
+frob () |
|
+{ |
|
+ unsigned long int num[1000]; |
|
+ unsigned long int den[1000]; |
|
+ arf (den, num); |
|
+} |
|
+ |
|
+/* This verifies that the scheduler did not break the dependencies |
|
+ by adjusting the offsets within the probe and that the scheduler |
|
+ did not reorder around the stack probes. */ |
|
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #4096\\n\\tstr\\txzr, .sp, 4088." 3 } } */ |
|
+ |
|
+ |
|
+ |
|
--- gcc/testsuite/gcc.target/aarch64/stack-check-13.c |
|
+++ gcc/testsuite/gcc.target/aarch64/stack-check-13.c |
|
@@ -0,0 +1,28 @@ |
|
+/* { dg-do compile } */ |
|
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */ |
|
+/* { dg-require-effective-target supports_stack_clash_protection } */ |
|
+ |
|
+#define ARG32(X) X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X |
|
+#define ARG192(X) ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X) |
|
+void out1(ARG192(__int128)); |
|
+int t1(int); |
|
+ |
|
+int t3(int x) |
|
+{ |
|
+ if (x < 1000) |
|
+ return t1 (x) + 1; |
|
+ |
|
+ out1 (ARG192(1)); |
|
+ return 0; |
|
+} |
|
+ |
|
+ |
|
+ |
|
+/* This test creates a large (> 1k) outgoing argument area that needs |
|
+ to be probed. We don't test the exact size of the space or the |
|
+ exact offset to make the test a little less sensitive to trivial |
|
+ output changes. */ |
|
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #....\\n\\tstr\\txzr, \\\[sp" 1 } } */ |
|
+ |
|
+ |
|
+ |
|
--- gcc/testsuite/gcc.target/aarch64/stack-check-14.c |
|
+++ gcc/testsuite/gcc.target/aarch64/stack-check-14.c |
|
@@ -0,0 +1,25 @@ |
|
+/* { dg-do compile } */ |
|
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */ |
|
+/* { dg-require-effective-target supports_stack_clash_protection } */ |
|
+ |
|
+int t1(int); |
|
+ |
|
+int t2(int x) |
|
+{ |
|
+ char *p = __builtin_alloca (4050); |
|
+ x = t1 (x); |
|
+ return p[x]; |
|
+} |
|
+ |
|
+ |
|
+/* This test has a constant sized alloca that is smaller than the |
|
+ probe interval. But it actually requires two probes instead |
|
+ of one because of the optimistic assumptions we made in the |
|
+ aarch64 prologue code WRT probing state. |
|
+ |
|
+ The form can change quite a bit so we just check for two |
|
+ probes without looking at the actual address. */ |
|
+/* { dg-final { scan-assembler-times "str\\txzr," 2 } } */ |
|
+ |
|
+ |
|
+ |
|
--- gcc/testsuite/gcc.target/aarch64/stack-check-15.c |
|
+++ gcc/testsuite/gcc.target/aarch64/stack-check-15.c |
|
@@ -0,0 +1,24 @@ |
|
+/* { dg-do compile } */ |
|
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */ |
|
+/* { dg-require-effective-target supports_stack_clash_protection } */ |
|
+ |
|
+int t1(int); |
|
+ |
|
+int t2(int x) |
|
+{ |
|
+ char *p = __builtin_alloca (x); |
|
+ x = t1 (x); |
|
+ return p[x]; |
|
+} |
|
+ |
|
+ |
|
+/* This test has a variable sized alloca. It requires 3 probes. |
|
+ One in the loop, one for the residual and one at the end of the |
|
+ alloca area. |
|
+ |
|
+ The form can change quite a bit so we just check for three |
|
+ probes without looking at the actual address. */ |
|
+/* { dg-final { scan-assembler-times "str\\txzr," 3 } } */ |
|
+ |
|
+ |
|
+ |
|
--- gcc/testsuite/lib/target-supports.exp |
|
+++ gcc/testsuite/lib/target-supports.exp |
|
@@ -9201,14 +9201,9 @@ proc check_effective_target_autoincdec { } { |
|
# |
|
proc check_effective_target_supports_stack_clash_protection { } { |
|
|
|
- # Temporary until the target bits are fully ACK'd. |
|
-# if { [istarget aarch*-*-*] } { |
|
-# return 1 |
|
-# } |
|
- |
|
if { [istarget x86_64-*-*] || [istarget i?86-*-*] |
|
|| [istarget powerpc*-*-*] || [istarget rs6000*-*-*] |
|
- || [istarget s390*-*-*] } { |
|
+ || [istarget aarch64*-**] || [istarget s390*-*-*] } { |
|
return 1 |
|
} |
|
return 0 |
|
@@ -9217,9 +9212,9 @@ proc check_effective_target_supports_stack_clash_protection { } { |
|
# Return 1 if the target creates a frame pointer for non-leaf functions |
|
# Note we ignore cases where we apply tail call optimization here. |
|
proc check_effective_target_frame_pointer_for_non_leaf { } { |
|
- if { [istarget aarch*-*-*] } { |
|
- return 1 |
|
- } |
|
+# if { [istarget aarch*-*-*] } { |
|
+# return 1 |
|
+# } |
|
|
|
# Solaris/x86 defaults to -fno-omit-frame-pointer. |
|
if { [istarget i?86-*-solaris*] || [istarget x86_64-*-solaris*] } {
|
|
|