You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
435 lines
15 KiB
435 lines
15 KiB
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
|
From: Daniel Axtens <dja@axtens.net> |
|
Date: Mon, 6 Feb 2023 10:03:22 -0500 |
|
Subject: [PATCH] ieee1275: support runtime memory claiming |
|
|
|
On powerpc-ieee1275, we are running out of memory trying to verify |
|
anything. This is because: |
|
|
|
- We have to load an entire file into memory to verify it. This is |
|
difficult to change with appended signatures. |
|
- We only have 32MB of heap. |
|
- Distro kernels are now often around 30MB. |
|
|
|
So we want to be able to claim more memory from OpenFirmware for our heap |
|
at runtime. |
|
|
|
There are some complications: |
|
|
|
- The grub mm code isn't the only thing that will make claims on |
|
memory from OpenFirmware: |
|
|
|
* PFW/SLOF will have claimed some for their own use. |
|
|
|
* The ieee1275 loader will try to find other bits of memory that we |
|
haven't claimed to place the kernel and initrd when we go to boot. |
|
|
|
* Once we load Linux, it will also try to claim memory. It claims |
|
memory without any reference to /memory/available, it just starts |
|
at min(top of RMO, 768MB) and works down. So we need to avoid this |
|
area. See arch/powerpc/kernel/prom_init.c as of v5.11. |
|
|
|
- The smallest amount of memory a ppc64 KVM guest can have is 256MB. |
|
Such a guest doesn't work with distro kernels but can work with custom kernels. |
|
We should maintain support for that. (ppc32 can boot with even less, |
|
and we shouldn't break that either.) |
|
|
|
- Even if a VM has more memory, the memory OpenFirmware makes available |
|
as Real Memory Area can be restricted. Even with our CAS work, an LPAR |
|
on a PowerVM box is likely to have only 512MB available to OpenFirmware |
|
even if it has many gigabytes of memory allocated. |
|
|
|
What should we do? |
|
|
|
We don't know in advance how big the kernel and initrd are going to be, |
|
which makes figuring out how much memory we can take a bit tricky. |
|
|
|
To figure out how much memory we should leave unused, I looked at: |
|
|
|
- an Ubuntu 20.04.1 ppc64le pseries KVM guest: |
|
vmlinux: ~30MB |
|
initrd: ~50MB |
|
|
|
- a RHEL8.2 ppc64le pseries KVM guest: |
|
vmlinux: ~30MB |
|
initrd: ~30MB |
|
|
|
So to give us a little wriggle room, I think we want to leave at least |
|
128MB for the loader to put vmlinux and initrd in memory and leave Linux |
|
with space to satisfy its early allocations. |
|
|
|
Allow other space to be allocated at runtime. |
|
|
|
Tested-by: Stefan Berger <stefanb@linux.ibm.com> |
|
Signed-off-by: Daniel Axtens <dja@axtens.net> |
|
(cherry picked from commit a5c710789ccdd27a84ae4a34c7d453bd585e2b66) |
|
[rharwood: _start?] |
|
--- |
|
grub-core/kern/ieee1275/init.c | 270 ++++++++++++++++++++++++++++++++++++++--- |
|
docs/grub-dev.texi | 7 +- |
|
2 files changed, 257 insertions(+), 20 deletions(-) |
|
|
|
diff --git a/grub-core/kern/ieee1275/init.c b/grub-core/kern/ieee1275/init.c |
|
index c8d551759d..85af8fa97b 100644 |
|
--- a/grub-core/kern/ieee1275/init.c |
|
+++ b/grub-core/kern/ieee1275/init.c |
|
@@ -46,13 +46,26 @@ |
|
#endif |
|
#include <grub/lockdown.h> |
|
|
|
-/* The maximum heap size we're going to claim */ |
|
+/* The maximum heap size we're going to claim at boot. Not used by sparc. */ |
|
#ifdef __i386__ |
|
#define HEAP_MAX_SIZE (unsigned long) (64 * 1024 * 1024) |
|
-#else |
|
+#else /* __powerpc__ */ |
|
#define HEAP_MAX_SIZE (unsigned long) (32 * 1024 * 1024) |
|
#endif |
|
|
|
+/* RMO max. address at 768 MB */ |
|
+#define RMO_ADDR_MAX (grub_uint64_t) (768 * 1024 * 1024) |
|
+ |
|
+/* |
|
+ * The amount of OF space we will not claim here so as to leave space for |
|
+ * the loader and linux to service early allocations. |
|
+ * |
|
+ * In 2021, Daniel Axtens claims that we should leave at least 128MB to |
|
+ * ensure we can load a stock kernel and initrd on a pseries guest with |
|
+ * a 512MB real memory area under PowerVM. |
|
+ */ |
|
+#define RUNTIME_MIN_SPACE (128UL * 1024 * 1024) |
|
+ |
|
extern char _end[]; |
|
|
|
#ifdef __sparc__ |
|
@@ -147,16 +160,52 @@ grub_claim_heap (void) |
|
+ GRUB_KERNEL_MACHINE_STACK_SIZE), 0x200000); |
|
} |
|
#else |
|
-/* Helper for grub_claim_heap. */ |
|
+/* Helpers for mm on powerpc. */ |
|
+ |
|
+/* |
|
+ * How much memory does OF believe exists in total? |
|
+ * |
|
+ * This isn't necessarily the true total. It can be the total memory |
|
+ * accessible in real mode for a pseries guest, for example. |
|
+ */ |
|
+static grub_uint64_t rmo_top; |
|
+ |
|
static int |
|
-heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
- void *data) |
|
+count_free (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
+ void *data) |
|
{ |
|
- unsigned long *total = data; |
|
+ if (type != GRUB_MEMORY_AVAILABLE) |
|
+ return 0; |
|
+ |
|
+ /* Do not consider memory beyond 4GB */ |
|
+ if (addr > 0xffffffffULL) |
|
+ return 0; |
|
+ |
|
+ if (addr + len > 0xffffffffULL) |
|
+ len = 0xffffffffULL - addr; |
|
+ |
|
+ *(grub_uint32_t *) data += len; |
|
+ |
|
+ return 0; |
|
+} |
|
+ |
|
+static int |
|
+regions_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
+ unsigned int flags, void *data) |
|
+{ |
|
+ grub_uint32_t total = *(grub_uint32_t *) data; |
|
+ grub_uint64_t linux_rmo_save; |
|
|
|
if (type != GRUB_MEMORY_AVAILABLE) |
|
return 0; |
|
|
|
+ /* Do not consider memory beyond 4GB */ |
|
+ if (addr > 0xffffffffULL) |
|
+ return 0; |
|
+ |
|
+ if (addr + len > 0xffffffffULL) |
|
+ len = 0xffffffffULL - addr; |
|
+ |
|
if (grub_ieee1275_test_flag (GRUB_IEEE1275_FLAG_NO_PRE1_5M_CLAIM)) |
|
{ |
|
if (addr + len <= 0x180000) |
|
@@ -169,10 +218,6 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
} |
|
} |
|
|
|
- /* Never exceed HEAP_MAX_SIZE */ |
|
- if (*total + len > HEAP_MAX_SIZE) |
|
- len = HEAP_MAX_SIZE - *total; |
|
- |
|
/* In theory, firmware should already prevent this from happening by not |
|
listing our own image in /memory/available. The check below is intended |
|
as a safeguard in case that doesn't happen. However, it doesn't protect |
|
@@ -184,6 +229,108 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
len = 0; |
|
} |
|
|
|
+ /* |
|
+ * Linux likes to claim memory at min(RMO top, 768MB) and works down |
|
+ * without reference to /memory/available. (See prom_init.c::alloc_down) |
|
+ * |
|
+ * If this block contains min(RMO top, 768MB), do not claim below that for |
|
+ * at least a few MB (this is where RTAS, SML and potentially TCEs live). |
|
+ * |
|
+ * We also need to leave enough space for the DT in the RMA. (See |
|
+ * prom_init.c::alloc_up) |
|
+ * |
|
+ * Finally, we also want to make sure that when grub loads the kernel, |
|
+ * it isn't going to use up all the memory we're trying to reserve! So |
|
+ * enforce our entire RUNTIME_MIN_SPACE here: |
|
+ * |
|
+ * |---------- Top of memory ----------| |
|
+ * | | |
|
+ * | available | |
|
+ * | | |
|
+ * |---------- 768 MB ----------| |
|
+ * | | |
|
+ * | reserved | |
|
+ * | | |
|
+ * |--- 768 MB - runtime min space ---| |
|
+ * | | |
|
+ * | available | |
|
+ * | | |
|
+ * |---------- 0 MB ----------| |
|
+ * |
|
+ * Edge cases: |
|
+ * |
|
+ * - Total memory less than RUNTIME_MIN_SPACE: only claim up to HEAP_MAX_SIZE. |
|
+ * (enforced elsewhere) |
|
+ * |
|
+ * - Total memory between RUNTIME_MIN_SPACE and 768MB: |
|
+ * |
|
+ * |---------- Top of memory ----------| |
|
+ * | | |
|
+ * | reserved | |
|
+ * | | |
|
+ * |---- top - runtime min space ----| |
|
+ * | | |
|
+ * | available | |
|
+ * | | |
|
+ * |---------- 0 MB ----------| |
|
+ * |
|
+ * This by itself would not leave us with RUNTIME_MIN_SPACE of free bytes: if |
|
+ * rmo_top < 768MB, we will almost certainly have FW claims in the reserved |
|
+ * region. We try to address that elsewhere: grub_ieee1275_mm_add_region will |
|
+ * not call us if the resulting free space would be less than RUNTIME_MIN_SPACE. |
|
+ */ |
|
+ linux_rmo_save = grub_min (RMO_ADDR_MAX, rmo_top) - RUNTIME_MIN_SPACE; |
|
+ if (rmo_top > RUNTIME_MIN_SPACE) |
|
+ { |
|
+ if (rmo_top <= RMO_ADDR_MAX) |
|
+ { |
|
+ if (addr > linux_rmo_save) |
|
+ { |
|
+ grub_dprintf ("ieee1275", "rejecting region in RUNTIME_MIN_SPACE reservation (%llx)\n", |
|
+ addr); |
|
+ return 0; |
|
+ } |
|
+ else if (addr + len > linux_rmo_save) |
|
+ { |
|
+ grub_dprintf ("ieee1275", "capping region: (%llx -> %llx) -> (%llx -> %llx)\n", |
|
+ addr, addr + len, addr, rmo_top - RUNTIME_MIN_SPACE); |
|
+ len = linux_rmo_save - addr; |
|
+ } |
|
+ } |
|
+ else |
|
+ { |
|
+ /* |
|
+ * we order these cases to prefer higher addresses and avoid some |
|
+ * splitting issues |
|
+ */ |
|
+ if (addr < RMO_ADDR_MAX && (addr + len) > RMO_ADDR_MAX) |
|
+ { |
|
+ grub_dprintf ("ieee1275", |
|
+ "adjusting region for RUNTIME_MIN_SPACE: (%llx -> %llx) -> (%llx -> %llx)\n", |
|
+ addr, addr + len, RMO_ADDR_MAX, addr + len); |
|
+ len = (addr + len) - RMO_ADDR_MAX; |
|
+ addr = RMO_ADDR_MAX; |
|
+ } |
|
+ else if ((addr < linux_rmo_save) && ((addr + len) > linux_rmo_save)) |
|
+ { |
|
+ grub_dprintf ("ieee1275", "capping region: (%llx -> %llx) -> (%llx -> %llx)\n", |
|
+ addr, addr + len, addr, linux_rmo_save); |
|
+ len = linux_rmo_save - addr; |
|
+ } |
|
+ else if (addr >= linux_rmo_save && (addr + len) <= RMO_ADDR_MAX) |
|
+ { |
|
+ grub_dprintf ("ieee1275", "rejecting region in RUNTIME_MIN_SPACE reservation (%llx)\n", |
|
+ addr); |
|
+ return 0; |
|
+ } |
|
+ } |
|
+ } |
|
+ if (flags & GRUB_MM_ADD_REGION_CONSECUTIVE && len < total) |
|
+ return 0; |
|
+ |
|
+ if (len > total) |
|
+ len = total; |
|
+ |
|
if (len) |
|
{ |
|
grub_err_t err; |
|
@@ -192,15 +339,95 @@ heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
if (err) |
|
return err; |
|
grub_mm_init_region ((void *) (grub_addr_t) addr, len); |
|
+ total -= len; |
|
} |
|
|
|
- *total += len; |
|
- if (*total >= HEAP_MAX_SIZE) |
|
+ *(grub_uint32_t *) data = total; |
|
+ |
|
+ if (total == 0) |
|
return 1; |
|
|
|
return 0; |
|
} |
|
|
|
+static int |
|
+heap_init (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
+ void *data) |
|
+{ |
|
+ return regions_claim (addr, len, type, GRUB_MM_ADD_REGION_NONE, data); |
|
+} |
|
+ |
|
+static int |
|
+region_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type, |
|
+ void *data) |
|
+{ |
|
+ return regions_claim (addr, len, type, GRUB_MM_ADD_REGION_CONSECUTIVE, data); |
|
+} |
|
+ |
|
+static grub_err_t |
|
+grub_ieee1275_mm_add_region (grub_size_t size, unsigned int flags) |
|
+{ |
|
+ grub_uint32_t free_memory = 0; |
|
+ grub_uint32_t avail = 0; |
|
+ grub_uint32_t total; |
|
+ |
|
+ grub_dprintf ("ieee1275", "mm requested region of size %x, flags %x\n", |
|
+ size, flags); |
|
+ |
|
+ /* |
|
+ * Update free memory each time, which is a bit inefficient but guards us |
|
+ * against a situation where some OF driver goes out to firmware for |
|
+ * memory and we don't realise. |
|
+ */ |
|
+ grub_machine_mmap_iterate (count_free, &free_memory); |
|
+ |
|
+ /* Ensure we leave enough space to boot. */ |
|
+ if (free_memory <= RUNTIME_MIN_SPACE + size) |
|
+ { |
|
+ grub_dprintf ("ieee1275", "Cannot satisfy allocation and retain minimum runtime space\n"); |
|
+ return GRUB_ERR_OUT_OF_MEMORY; |
|
+ } |
|
+ |
|
+ if (free_memory > RUNTIME_MIN_SPACE) |
|
+ avail = free_memory - RUNTIME_MIN_SPACE; |
|
+ |
|
+ grub_dprintf ("ieee1275", "free = 0x%x available = 0x%x\n", free_memory, avail); |
|
+ |
|
+ if (flags & GRUB_MM_ADD_REGION_CONSECUTIVE) |
|
+ { |
|
+ /* first try rounding up hard for the sake of speed */ |
|
+ total = grub_max (ALIGN_UP (size, 1024 * 1024) + 1024 * 1024, 32 * 1024 * 1024); |
|
+ total = grub_min (avail, total); |
|
+ |
|
+ grub_dprintf ("ieee1275", "looking for %x bytes of memory (%x requested)\n", total, size); |
|
+ |
|
+ grub_machine_mmap_iterate (region_claim, &total); |
|
+ grub_dprintf ("ieee1275", "get memory from fw %s\n", total == 0 ? "succeeded" : "failed"); |
|
+ |
|
+ if (total != 0) |
|
+ { |
|
+ total = grub_min (avail, size); |
|
+ |
|
+ grub_dprintf ("ieee1275", "fallback for %x bytes of memory (%x requested)\n", total, size); |
|
+ |
|
+ grub_machine_mmap_iterate (region_claim, &total); |
|
+ grub_dprintf ("ieee1275", "fallback from fw %s\n", total == 0 ? "succeeded" : "failed"); |
|
+ } |
|
+ } |
|
+ else |
|
+ { |
|
+ /* provide padding for a grub_mm_header_t and region */ |
|
+ total = grub_min (avail, size); |
|
+ grub_machine_mmap_iterate (heap_init, &total); |
|
+ grub_dprintf ("ieee1275", "get noncontig memory from fw %s\n", total == 0 ? "succeeded" : "failed"); |
|
+ } |
|
+ |
|
+ if (total == 0) |
|
+ return GRUB_ERR_NONE; |
|
+ else |
|
+ return GRUB_ERR_OUT_OF_MEMORY; |
|
+} |
|
+ |
|
/* |
|
* How much memory does OF believe it has? (regardless of whether |
|
* it's accessible or not) |
|
@@ -356,17 +583,24 @@ grub_ieee1275_ibm_cas (void) |
|
static void |
|
grub_claim_heap (void) |
|
{ |
|
- unsigned long total = 0; |
|
+ grub_err_t err; |
|
+ grub_uint32_t total = HEAP_MAX_SIZE; |
|
+ |
|
+ err = grub_ieee1275_total_mem (&rmo_top); |
|
+ |
|
+ /* |
|
+ * If we cannot size the available memory, we can't be sure we're leaving |
|
+ * space for the kernel, initrd and things Linux loads early in boot. So only |
|
+ * allow further allocations from firmware on success |
|
+ */ |
|
+ if (err == GRUB_ERR_NONE) |
|
+ grub_mm_add_region_fn = grub_ieee1275_mm_add_region; |
|
|
|
#if defined(__powerpc__) |
|
if (grub_ieee1275_test_flag (GRUB_IEEE1275_FLAG_CAN_TRY_CAS_FOR_MORE_MEMORY)) |
|
{ |
|
- grub_uint64_t rma_size; |
|
- grub_err_t err; |
|
- |
|
- err = grub_ieee1275_total_mem (&rma_size); |
|
/* if we have an error, don't call CAS, just hope for the best */ |
|
- if (err == GRUB_ERR_NONE && rma_size < (512 * 1024 * 1024)) |
|
+ if (err == GRUB_ERR_NONE && rmo_top < (512 * 1024 * 1024)) |
|
grub_ieee1275_ibm_cas (); |
|
} |
|
#endif |
|
diff --git a/docs/grub-dev.texi b/docs/grub-dev.texi |
|
index 7b2455a8fe..7edc5b7e2b 100644 |
|
--- a/docs/grub-dev.texi |
|
+++ b/docs/grub-dev.texi |
|
@@ -1047,7 +1047,10 @@ space is limited to 4GiB. GRUB allocates pages from EFI for its heap, at most |
|
1.6 GiB. |
|
|
|
On i386-ieee1275 and powerpc-ieee1275 GRUB uses same stack as IEEE1275. |
|
-It allocates at most 32MiB for its heap. |
|
+ |
|
+On i386-ieee1275 and powerpc-ieee1275, GRUB will allocate 32MiB for its heap on |
|
+startup. It may allocate more at runtime, as long as at least 128MiB remain free |
|
+in OpenFirmware. |
|
|
|
On sparc64-ieee1275 stack is 256KiB and heap is 2MiB. |
|
|
|
@@ -1075,7 +1078,7 @@ In short: |
|
@item i386-qemu @tab 60 KiB @tab < 4 GiB |
|
@item *-efi @tab ? @tab < 1.6 GiB |
|
@item i386-ieee1275 @tab ? @tab < 32 MiB |
|
-@item powerpc-ieee1275 @tab ? @tab < 32 MiB |
|
+@item powerpc-ieee1275 @tab ? @tab available memory - 128MiB |
|
@item sparc64-ieee1275 @tab 256KiB @tab 2 MiB |
|
@item arm-uboot @tab 256KiB @tab 2 MiB |
|
@item mips(el)-qemu_mips @tab 2MiB @tab 253 MiB
|
|
|