From 2d6bff31399b2443aba230cd20f120acb2d4eeb1 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Fri, 14 Feb 2025 11:06:48 +0200
Subject: [PATCH 01/12] x86/boot: Move setting of memblock parameters to e820__memblock_setup()

Changing the memblock parameters, namely bottom_up and the allocation
upper limit, does not have any effect before memblock initialization in
e820__memblock_setup().

Move the calls to memblock_set_bottom_up() and memblock_set_current_limit()
to e820__memblock_setup() to group all the initial memblock setup together
and make setup_arch() more readable.

No functional changes.

Signed-off-by: Mike Rapoport (Microsoft)
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20250214090651.3331663-2-rppt@kernel.org
---
 arch/x86/kernel/e820.c  | 30 ++++++++++++++++++++++++++++++
 arch/x86/kernel/setup.c | 25 -------------------------
 2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 82b96ed9890a..8d8bd039d2ac 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1302,6 +1302,36 @@ void __init e820__memblock_setup(void)
 	int i;
 	u64 end;
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * Memory used by the kernel cannot be hot-removed because Linux
+	 * cannot migrate the kernel pages. When memory hotplug is
+	 * enabled, we should prevent memblock from allocating memory
+	 * for the kernel.
+	 *
+	 * ACPI SRAT records all hotpluggable memory ranges. But before
+	 * SRAT is parsed, we don't know about it.
+	 *
+	 * The kernel image is loaded into memory at very early time. We
+	 * cannot prevent this anyway. So on NUMA system, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers, one node could have double-digit
+	 * gigabytes memory, we can assume the memory around the kernel
+	 * image is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to try the best to keep
+	 * the kernel away from hotpluggable memory.
+	 */
+	if (movable_node_is_enabled())
+		memblock_set_bottom_up(true);
+#endif
+
+	/*
+	 * At this point only the first megabyte is mapped for sure, the
+	 * rest of the memory cannot be used for memblock resizing
+	 */
+	memblock_set_current_limit(ISA_END_ADDRESS);
+
 	/*
 	 * The bootstrap memblock region count maximum is 128 entries
 	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cebee310e200..4baadeab8a2e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -866,30 +866,6 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled(EFI_BOOT))
 		efi_memblock_x86_reserve_range();
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/*
-	 * Memory used by the kernel cannot be hot-removed because Linux
-	 * cannot migrate the kernel pages. When memory hotplug is
-	 * enabled, we should prevent memblock from allocating memory
-	 * for the kernel.
-	 *
-	 * ACPI SRAT records all hotpluggable memory ranges. But before
-	 * SRAT is parsed, we don't know about it.
-	 *
-	 * The kernel image is loaded into memory at very early time. We
-	 * cannot prevent this anyway. So on NUMA system, we set any
-	 * node the kernel resides in as un-hotpluggable.
-	 *
-	 * Since on modern servers, one node could have double-digit
-	 * gigabytes memory, we can assume the memory around the kernel
-	 * image is also un-hotpluggable. So before SRAT is parsed, just
-	 * allocate memory near the kernel image to try the best to keep
-	 * the kernel away from hotpluggable memory.
- */ - if (movable_node_is_enabled()) - memblock_set_bottom_up(true); -#endif - x86_report_nx(); apic_setup_apic_calls(); @@ -990,7 +966,6 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); /* From 297fb82ebaad50e1c40aab7ac61897bf372842ba Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 14 Feb 2025 11:06:49 +0200 Subject: [PATCH 02/12] x86/boot: Split kernel resources setup into the setup_kernel_resources() helper function Makes setup_arch() a bit easier to comprehend. No functional changes. Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250214090651.3331663-3-rppt@kernel.org --- arch/x86/kernel/setup.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4baadeab8a2e..3d95946ab749 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -527,6 +527,23 @@ void __init reserve_standard_io_resources(void) } +static void __init setup_kernel_resources(void) +{ + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); +} + static bool __init snb_gfx_workaround_needed(void) { #ifdef CONFIG_PCI @@ -845,15 +862,6 @@ void __init setup_arch(char **cmdline_p) root_mountflags &= ~MS_RDONLY; setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end); - code_resource.start = __pa_symbol(_text); - code_resource.end = __pa_symbol(_etext)-1; - rodata_resource.start = __pa_symbol(__start_rodata); - rodata_resource.end = __pa_symbol(__end_rodata)-1; - data_resource.start = __pa_symbol(_sdata); - data_resource.end = __pa_symbol(_edata)-1; - bss_resource.start = __pa_symbol(__bss_start); - bss_resource.end = __pa_symbol(__bss_stop)-1; - /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug @@ -897,11 +905,11 @@ void __init setup_arch(char **cmdline_p) tsc_early_init(); x86_init.resources.probe_roms(); - /* after parse_early_param, so could debug it */ - insert_resource(&iomem_resource, &code_resource); - insert_resource(&iomem_resource, &rodata_resource); - insert_resource(&iomem_resource, &data_resource); - insert_resource(&iomem_resource, &bss_resource); + /* + * Add resources for kernel text and data to the iomem_resource. + * Do it after parse_early_param, so it can be debugged. + */ + setup_kernel_resources(); e820_add_kernel_range(); trim_bios_range(); From d45dd0a9b27ee8be7792ae186bc33ed700144224 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 14 Feb 2025 11:06:50 +0200 Subject: [PATCH 03/12] x86/boot: Split parsing of boot_params into the parse_boot_params() helper function Makes setup_arch() a bit easier to comprehend. No functional changes. 
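[ Editor's note: the nibble packing that parse_boot_params() inherits in the
  diff below is easier to follow with concrete numbers. A standalone
  userspace sketch, with made-up example values that are not from the
  series: ]

	/* Illustration only: how an extended loader type/version decodes. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int type_of_loader  = 0xe4;	/* high nibble 0xe selects the extended fields */
		unsigned int ext_loader_type = 0x15;	/* hypothetical value */
		unsigned int ext_loader_ver  = 0x23;	/* hypothetical value */
		unsigned int bootloader_type, bootloader_version;

		bootloader_type = type_of_loader;
		if ((bootloader_type >> 4) == 0xe) {
			bootloader_type &= 0xf;
			bootloader_type |= (ext_loader_type + 0x10) << 4;
		}
		bootloader_version  = bootloader_type & 0xf;
		bootloader_version |= ext_loader_ver << 4;

		/* Prints: type 0x254, version 0x234 */
		printf("type 0x%x, version 0x%x\n",
		       bootloader_type, bootloader_version);
		return 0;
	}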
Signed-off-by: Mike Rapoport (Microsoft)
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20250214090651.3331663-4-rppt@kernel.org
---
 arch/x86/kernel/setup.c | 72 +++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3d95946ab749..6fb9a851bc19 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -429,6 +429,46 @@ static void __init parse_setup_data(void)
 	}
 }
 
+/*
+ * Translate the fields of 'struct boot_params' into global variables
+ * representing these parameters.
+ */
+static void __init parse_boot_params(void)
+{
+	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+	screen_info = boot_params.screen_info;
+	edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+	apm_info.bios = boot_params.apm_bios_info;
+	ist_info = boot_params.ist_info;
+#endif
+	saved_video_mode = boot_params.hdr.vid_mode;
+	bootloader_type = boot_params.hdr.type_of_loader;
+	if ((bootloader_type >> 4) == 0xe) {
+		bootloader_type &= 0xf;
+		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+	}
+	bootloader_version  = bootloader_type & 0xf;
+	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+
+#ifdef CONFIG_BLK_DEV_RAM
+	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+#endif
+#ifdef CONFIG_EFI
+	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+		     EFI32_LOADER_SIGNATURE, 4)) {
+		set_bit(EFI_BOOT, &efi.flags);
+	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+		     EFI64_LOADER_SIGNATURE, 4)) {
+		set_bit(EFI_BOOT, &efi.flags);
+		set_bit(EFI_64BIT, &efi.flags);
+	}
+#endif
+
+	if (!boot_params.hdr.root_flags)
+		root_mountflags &= ~MS_RDONLY;
+}
+
 static void __init memblock_x86_reserve_range_setup_data(void)
 {
 	struct setup_indirect *indirect;
@@ -806,35 +846,7 @@ void __init setup_arch(char **cmdline_p)
 
 	setup_olpc_ofw_pgd();
 
-	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
-	screen_info = boot_params.screen_info;
-	edid_info = boot_params.edid_info;
-#ifdef CONFIG_X86_32
-	apm_info.bios = boot_params.apm_bios_info;
-	ist_info = boot_params.ist_info;
-#endif
-	saved_video_mode = boot_params.hdr.vid_mode;
-	bootloader_type = boot_params.hdr.type_of_loader;
-	if ((bootloader_type >> 4) == 0xe) {
-		bootloader_type &= 0xf;
-		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
-	}
-	bootloader_version  = bootloader_type & 0xf;
-	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
-
-#ifdef CONFIG_BLK_DEV_RAM
-	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
-#endif
-#ifdef CONFIG_EFI
-	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     EFI32_LOADER_SIGNATURE, 4)) {
-		set_bit(EFI_BOOT, &efi.flags);
-	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     EFI64_LOADER_SIGNATURE, 4)) {
-		set_bit(EFI_BOOT, &efi.flags);
-		set_bit(EFI_64BIT, &efi.flags);
-	}
-#endif
+	parse_boot_params();
 
 	x86_init.oem.arch_setup();
 
@@ -858,8 +870,6 @@ void __init setup_arch(char **cmdline_p)
 
 	copy_edd();
 
-	if (!boot_params.hdr.root_flags)
-		root_mountflags &= ~MS_RDONLY;
 	setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
 
 	/*

From efe659ac014647eb048d62475e55cce42d8951d7 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Fri, 14 Feb 2025 11:06:51 +0200
Subject: [PATCH 04/12] x86/e820: Drop obsolete E820_TYPE_RESERVED_KERN and related code

E820_TYPE_RESERVED_KERN is a relic from ancient history that was used to
early reserve
setup_data, see:

  28bb22379513 ("x86: move reserve_setup_data to setup.c")

Nowadays setup_data is anyway reserved in memblock, and there is no point
in carrying E820_TYPE_RESERVED_KERN, which behaves exactly like
E820_TYPE_RAM but only complicates the code.

A bonus for removing E820_TYPE_RESERVED_KERN is a small but measurable
speedup of 20 microseconds in init_mem_mappings() on a VM with 32GB of RAM.

Signed-off-by: Mike Rapoport (Microsoft)
Signed-off-by: Ingo Molnar
Cc: "H. Peter Anvin"
Cc: Ard Biesheuvel
Cc: Kees Cook
Link: https://lore.kernel.org/r/20250214090651.3331663-5-rppt@kernel.org
---
 arch/x86/include/asm/e820/api.h   |  1 -
 arch/x86/include/asm/e820/types.h |  9 -----
 arch/x86/kernel/e820.c            | 65 ++-----------------------------
 arch/x86/kernel/setup.c           |  1 -
 arch/x86/kernel/tboot.c           |  3 +-
 arch/x86/mm/init_64.c             |  8 ----
 6 files changed, 4 insertions(+), 83 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 2e74a7f0e935..c83645d5b2a8 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -29,7 +29,6 @@ extern unsigned long e820__end_of_low_ram_pfn(void);
 
 extern u64  e820__memblock_alloc_reserved(u64 size, u64 align);
 extern void e820__memblock_setup(void);
-extern void e820__reserve_setup_data(void);
 extern void e820__finish_early_params(void);
 extern void e820__reserve_resources(void);
 extern void e820__reserve_resources_late(void);
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
index 314f75d886d0..80c4a7266629 100644
--- a/arch/x86/include/asm/e820/types.h
+++ b/arch/x86/include/asm/e820/types.h
@@ -35,15 +35,6 @@ enum e820_type {
 	 * marking it with the IORES_DESC_SOFT_RESERVED designation.
 	 */
 	E820_TYPE_SOFT_RESERVED	= 0xefffffff,
-
-	/*
-	 * Reserved RAM used by the kernel itself if
-	 * CONFIG_INTEL_TXT=y is enabled, memory of this type
-	 * will be included in the S3 integrity calculation
-	 * and so should not include any memory that the BIOS
-	 * might alter over the S3 transition:
-	 */
-	E820_TYPE_RESERVED_KERN	= 128,
 };
 
 /*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 8d8bd039d2ac..9fb67abc32cc 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -187,8 +187,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type)
 static void __init e820_print_type(enum e820_type type)
 {
 	switch (type) {
-	case E820_TYPE_RAM:		/* Fall through: */
-	case E820_TYPE_RESERVED_KERN:	pr_cont("usable");			break;
+	case E820_TYPE_RAM:		pr_cont("usable");			break;
 	case E820_TYPE_RESERVED:	pr_cont("reserved");			break;
 	case E820_TYPE_SOFT_RESERVED:	pr_cont("soft reserved");		break;
 	case E820_TYPE_ACPI:		pr_cont("ACPI data");			break;
@@ -764,7 +763,7 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn)
 
 		pfn = PFN_DOWN(entry->addr + entry->size);
 
-		if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+		if (entry->type != E820_TYPE_RAM)
 			register_nosave_region(PFN_UP(entry->addr), pfn);
 
 		if (pfn >= limit_pfn)
@@ -990,60 +989,6 @@ static int __init parse_memmap_opt(char *str)
 }
 early_param("memmap", parse_memmap_opt);
 
-/*
- * Reserve all entries from the bootloader's extensible data nodes list,
- * because if present we are going to use it later on to fetch e820
- * entries from it:
- */
-void __init e820__reserve_setup_data(void)
-{
-	struct setup_indirect *indirect;
-	struct setup_data *data;
-	u64 pa_data, pa_next;
-	u32 len;
-
-	pa_data = boot_params.hdr.setup_data;
-	if (!pa_data)
-		return;
-
-	while (pa_data) {
-		data =
early_memremap(pa_data, sizeof(*data)); - if (!data) { - pr_warn("e820: failed to memremap setup_data entry\n"); - return; - } - - len = sizeof(*data); - pa_next = data->next; - - e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - - if (data->type == SETUP_INDIRECT) { - len += data->len; - early_memunmap(data, sizeof(*data)); - data = early_memremap(pa_data, len); - if (!data) { - pr_warn("e820: failed to memremap indirect setup_data\n"); - return; - } - - indirect = (struct setup_indirect *)data->data; - - if (indirect->type != SETUP_INDIRECT) - e820__range_update(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } - - pa_data = pa_next; - early_memunmap(data, len); - } - - e820__update_table(e820_table); - - pr_info("extended physical RAM map:\n"); - e820__print_table("reserve setup_data"); -} - /* * Called after parse_early_param(), after early parameters (such as mem=) * have been processed, in which case we already have an E820 table filled in @@ -1063,7 +1008,6 @@ void __init e820__finish_early_params(void) static const char *__init e820_type_to_string(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return "System RAM"; case E820_TYPE_ACPI: return "ACPI Tables"; case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; @@ -1079,7 +1023,6 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; case E820_TYPE_ACPI: /* Fall-through: */ case E820_TYPE_NVS: /* Fall-through: */ @@ -1101,7 +1044,6 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ default: return IORES_DESC_NONE; @@ -1124,7 +1066,6 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; - case E820_TYPE_RESERVED_KERN: case E820_TYPE_RAM: case E820_TYPE_ACPI: case E820_TYPE_NVS: @@ -1353,7 +1294,7 @@ void __init e820__memblock_setup(void) if (entry->type == E820_TYPE_SOFT_RESERVED) memblock_reserve(entry->addr, entry->size); - if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM) continue; memblock_add(entry->addr, entry->size); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6fb9a851bc19..46d92d04b6da 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -895,7 +895,6 @@ void __init setup_arch(char **cmdline_p) setup_clear_cpu_cap(X86_FEATURE_APIC); } - e820__reserve_setup_data(); e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c1bcb6053fc..46b8f1f16676 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -200,8 +200,7 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; for (i = 0; i < e820_table->nr_entries; i++) { - if ((e820_table->entries[i].type != E820_TYPE_RAM) - && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) + if 
(e820_table->entries[i].type != E820_TYPE_RAM)
 			continue;
 
 		add_mac_region(e820_table->entries[i].addr,
 			       e820_table->entries[i].size);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 01ea7c6df303..519aa53114fa 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -468,8 +468,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
 		if (!after_bootmem &&
 		    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
 				     E820_TYPE_RAM) &&
-		    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
-				     E820_TYPE_RESERVED_KERN) &&
 		    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
 				     E820_TYPE_ACPI))
 			set_pte_init(pte, __pte(0), init);
@@ -525,8 +523,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
 			if (!after_bootmem &&
 			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
 					     E820_TYPE_RAM) &&
-			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
-					     E820_TYPE_RESERVED_KERN) &&
 			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
 					     E820_TYPE_ACPI))
 				set_pmd_init(pmd, __pmd(0), init);
@@ -614,8 +610,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 			if (!after_bootmem &&
 			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
 					     E820_TYPE_RAM) &&
-			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
-					     E820_TYPE_RESERVED_KERN) &&
 			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
 					     E820_TYPE_ACPI))
 				set_pud_init(pud, __pud(0), init);
@@ -703,8 +697,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
 		if (!after_bootmem &&
 		    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
 				     E820_TYPE_RAM) &&
-		    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
-				     E820_TYPE_RESERVED_KERN) &&
 		    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
 				     E820_TYPE_ACPI))
 			set_p4d_init(p4d, __p4d(0), init);

From 5bebe2e4fe27bf68b4993d62be2f8bae9e6229d6 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Wed, 29 Jan 2025 16:47:50 +0100
Subject: [PATCH 05/12] x86/boot: Change some static bootflag functions to bool

The return values of some functions are of boolean type. Change the type
of these functions to bool and adjust their return values.

No functional change intended.

Signed-off-by: Uros Bizjak
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20250129154920.6773-1-ubizjak@gmail.com
---
 arch/x86/kernel/bootflag.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 3fed7ae58b60..4d89a2d80d0f 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -20,7 +20,7 @@
 
 int sbf_port __initdata = -1;	/* set via acpi_boot_init() */
 
-static int __init parity(u8 v)
+static bool __init parity(u8 v)
 {
 	int x = 0;
 	int i;
@@ -30,7 +30,7 @@ static int __init parity(u8 v)
 		v >>= 1;
 	}
 
-	return x;
+	return !!x;
 }
 
 static void __init sbf_write(u8 v)
@@ -66,14 +66,14 @@ static u8 __init sbf_read(void)
 	return v;
 }
 
-static int __init sbf_value_valid(u8 v)
+static bool __init sbf_value_valid(u8 v)
 {
 	if (v & SBF_RESERVED)		/* Reserved bits */
-		return 0;
+		return false;
 	if (!parity(v))
-		return 0;
+		return false;
 
-	return 1;
+	return true;
 }
 
 static int __init sbf_init(void)

From a2498e5c453b3d8d054d77751487cd593332f8c2 Mon Sep 17 00:00:00 2001
From: Dave Young
Date: Tue, 28 Jan 2025 21:32:31 +0800
Subject: [PATCH 06/12] x86/kexec: Export e820_table_kexec[] to sysfs

Previously the e820_table_kexec[] was exported to sysfs since kexec-tools
uses the memmap entries to prepare the e820 table for the new kernel.
The following commit, ~8 years ago, introduced e820_table_firmware[] and
changed the behavior to export the firmware table instead:

  12df216c61c8 ("x86/boot/e820: Introduce the bootloader provided e820_table_firmware[] table")

Originally the kexec_file_load and kexec_load syscalls both used
e820_table_kexec[]. Since the sysfs exported entries are from
e820_table_firmware[], people now need to tune both tables for kexec.

Restore the old behavior so the kexec_load and kexec_file_load syscalls
work with only one table update. The e820_table_firmware[] is used by the
hibernation kernel code, and that works without the sysfs export.

Also remove the SEV e820_table_firmware[] updating code.

Also update the code comments here, and drop the comments about setup_data
reservation, since it is no longer needed after this change made a year ago:

  fc7f27cda843 ("x86/kexec: Do not update E820 kexec table for setup_data")

[ mingo: Tidy up the changelog and comments. ]

Signed-off-by: Dave Young
Signed-off-by: Ingo Molnar
Cc: Ashish Kalra
Cc: Sean Christopherson
Cc: Joerg Roedel
Cc: Baoquan He
Cc: Vivek Goyal
Cc: Eric Biederman
Link: https://lore.kernel.org/r/Z5jcb1GKhLvH8kDc@darkstar.users.ipa.redhat.com
---
 arch/x86/kernel/e820.c  | 20 ++++++++++----------
 arch/x86/virt/svm/sev.c |  1 -
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9fb67abc32cc..57120f0749cc 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -28,18 +28,13 @@
  * the first 128 E820 memory entries in boot_params.e820_table and the remaining
  * (if any) entries of the SETUP_E820_EXT nodes. We use this to:
  *
- * - inform the user about the firmware's notion of memory layout
- *   via /sys/firmware/memmap
- *
  * - the hibernation code uses it to generate a kernel-independent CRC32
  *   checksum of the physical memory layout of a system.
  *
  * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
  *   passed to us by the bootloader - the major difference between
- *   e820_table_firmware[] and this one is that, the latter marks the setup_data
- *   list created by the EFI boot stub as reserved, so that kexec can reuse the
- *   setup_data information in the second kernel. Besides, e820_table_kexec[]
- *   might also be modified by the kexec itself to fake a mptable.
+ *   e820_table_firmware[] and this one is that e820_table_kexec[]
+ *   might be modified by the kexec itself to fake an mptable.
  *   We use this to:
  *
  *   - kexec, which is a bootloader in disguise, uses the original E820
@@ -47,6 +42,11 @@
  *     can have a restricted E820 map while the kexec()-ed kexec-kernel
  *     can have access to full memory - etc.
  *
+ *   Export the memory layout via /sys/firmware/memmap. kexec-tools uses
+ *   the entries to create an E820 table for the kexec kernel.
+ *
+ *   kexec_file_load in-kernel code uses the table for the kexec kernel.
+ *
  * - 'e820_table': this is the main E820 table that is massaged by the
  *   low level x86 platform code, or modified by boot parameters, before
  *   passed on to higher level MM layers.
@@ -1117,9 +1117,9 @@ void __init e820__reserve_resources(void)
 		res++;
 	}
 
-	/* Expose the bootloader-provided memory layout to the sysfs. */
-	for (i = 0; i < e820_table_firmware->nr_entries; i++) {
-		struct e820_entry *entry = e820_table_firmware->entries + i;
+	/* Expose the kexec e820 table to the sysfs.
*/
+	for (i = 0; i < e820_table_kexec->nr_entries; i++) {
+		struct e820_entry *entry = e820_table_kexec->entries + i;
 
 		firmware_map_add_early(entry->addr, entry->addr + entry->size,
 				       e820_type_to_string(entry));
 	}
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 42e74a5a7d78..fc473ca12c44 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -198,7 +198,6 @@ static void __init __snp_fixup_e820_tables(u64 pa)
 		pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
 		e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
 		e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
-		e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
 		if (!memblock_is_region_reserved(pa, PMD_SIZE))
 			memblock_reserve(pa, PMD_SIZE);
 }

From 7d8f03f7dd9f7d108b8d5af12fdc57e10555981f Mon Sep 17 00:00:00 2001
From: Zhou Ding
Date: Wed, 18 Dec 2024 00:28:59 +0800
Subject: [PATCH 07/12] x86/boot: Add missing has_cpuflag() prototype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We get a warning when building the kernel with W=1:

  arch/x86/boot/compressed/cpuflags.c:4:6: warning: no previous prototype for ‘has_cpuflag’ [-Werror=missing-prototypes]
      4 | bool has_cpuflag(int flag)
        |      ^~~~~~~~~~~

Add a function declaration to cpuflags.h.

Signed-off-by: Zhou Ding
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20241217162859.1167889-1-zhouding@cmss.chinamobile.com
---
 arch/x86/boot/cpuflags.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index 475b8fde90f7..fdcc2aa4c3c4 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -18,5 +18,6 @@ extern u32 cpu_vendor[3];
 int has_eflag(unsigned long mask);
 void get_cpuflags(void);
 void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d);
+bool has_cpuflag(int flag);
 
 #endif

From adf6819278ba34be1d29ffcdca5c2ccd2123f667 Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Wed, 26 Feb 2025 16:36:56 +0100
Subject: [PATCH 08/12] x86/bootflag: Micro-optimize sbf_write()

Flip the parity bit with an XOR when !parity(v), instead of masking the
bit out and then conditionally setting it. Either way, the value that is
written ends up with odd overall parity.

Saves a couple of bytes in the object file.

Co-developed-by: "H. Peter Anvin"
Signed-off-by: "H. Peter Anvin"
Signed-off-by: Uros Bizjak
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20250226153709.6370-1-ubizjak@gmail.com
---
 arch/x86/kernel/bootflag.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 4d89a2d80d0f..b935c3e42bfd 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -38,9 +38,8 @@ static void __init sbf_write(u8 v)
 	unsigned long flags;
 
 	if (sbf_port != -1) {
-		v &= ~SBF_PARITY;
 		if (!parity(v))
-			v |= SBF_PARITY;
+			v ^= SBF_PARITY;
 
 		printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
 			sbf_port, v);

From 9c94c14ca39577b6324c667d8450ffa19fc1e5c4 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Thu, 27 Feb 2025 13:55:45 +0100
Subject: [PATCH 09/12] x86/bootflag: Replace open-coded parity calculation with parity8()

Refactor the parity calculations to use the standard parity8() helper.
This change eliminates a redundant implementation and improves code
efficiency.

[ ubizjak: Updated the patch to apply to the latest x86 tree. ]
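[ Editor's note: the sbf_write() XOR trick above and the parity8()
  conversion below are easy to sanity-check in userspace. In the sketch
  that follows, the fold-based helper is just one common way to compute
  byte parity; it is not necessarily how the in-kernel parity8() is
  implemented: ]

	/* Compare the removed open-coded loop against a parity8()-style fold. */
	#include <stdbool.h>
	#include <stdio.h>

	static bool parity_loop(unsigned char v)	/* the loop removed from bootflag.c */
	{
		int x = 0, i;

		for (i = 0; i < 8; i++) {
			x ^= (v & 1);
			v >>= 1;
		}
		return !!x;
	}

	static bool parity_fold(unsigned char v)	/* folds bit pairs together */
	{
		v ^= v >> 4;
		v ^= v >> 2;
		v ^= v >> 1;
		return v & 1;
	}

	int main(void)
	{
		unsigned int v;

		for (v = 0; v < 256; v++)
			if (parity_loop(v) != parity_fold(v))
				printf("mismatch at 0x%02x\n", v);
		return 0;	/* prints nothing: the two agree on all byte values */
	}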
Co-developed-by: Yu-Chun Lin
Signed-off-by: Yu-Chun Lin
Signed-off-by: Kuan-Wei Chiu
Signed-off-by: Uros Bizjak
Signed-off-by: Ingo Molnar
Cc: "H. Peter Anvin"
Link: https://lore.kernel.org/r/20250227125616.2253774-1-ubizjak@gmail.com
---
 arch/x86/kernel/bootflag.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index b935c3e42bfd..73274d76ce16 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -20,25 +21,12 @@
 
 int sbf_port __initdata = -1;	/* set via acpi_boot_init() */
 
-static bool __init parity(u8 v)
-{
-	int x = 0;
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		x ^= (v & 1);
-		v >>= 1;
-	}
-
-	return !!x;
-}
-
 static void __init sbf_write(u8 v)
 {
 	unsigned long flags;
 
 	if (sbf_port != -1) {
-		if (!parity(v))
+		if (!parity8(v))
 			v ^= SBF_PARITY;
 
 		printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
@@ -69,7 +57,7 @@ static bool __init sbf_value_valid(u8 v)
 {
 	if (v & SBF_RESERVED)		/* Reserved bits */
 		return false;
-	if (!parity(v))
+	if (!parity8(v))
 		return false;
 
 	return true;
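[ Editor's note: the EFLAGS-changeability probe that the next patch
  compiles out on 64-bit can be tried from userspace. A sketch for
  illustration only; it uses the width-neutral pushf/popf mnemonics rather
  than the kernel's explicit 32-bit forms, and on any x86-64 CPU it always
  reports "yes", which is exactly the point of the patch: ]

	/* Probe whether an EFLAGS bit is changeable; ID (bit 21) implies CPUID. */
	#include <stdbool.h>
	#include <stdio.h>

	#define X86_EFLAGS_ID (1UL << 21)

	static bool eflag_changeable(unsigned long mask)
	{
		unsigned long f0, f1;

		asm volatile("pushf\n\t"
			     "pushf\n\t"
			     "pop %0\n\t"
			     "mov %0,%1\n\t"
			     "xor %2,%1\n\t"
			     "push %1\n\t"
			     "popf\n\t"
			     "pushf\n\t"
			     "pop %1\n\t"
			     "popf"
			     : "=&r" (f0), "=&r" (f1)
			     : "ri" (mask));
		return !!((f0 ^ f1) & mask);
	}

	int main(void)
	{
		printf("CPUID via ID flag: %s\n",
		       eflag_changeable(X86_EFLAGS_ID) ? "yes" : "no");
		return 0;
	}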
From 558fc8e1869ca6e1eb99a1e2b52f6c35424d4adf Mon Sep 17 00:00:00 2001
From: Uros Bizjak
Date: Fri, 7 Mar 2025 10:10:03 +0100
Subject: [PATCH 10/12] x86/boot: Do not test if AC and ID eflags are changeable on x86_64

The test for the changeability of the AC and ID EFLAGS bits is used to
distinguish between i386 and i486 processors (AC) and to test for CPUID
instruction support (ID).

Skip these tests on x86_64 processors, as they always support CPUID.

Also change the return type of has_eflag() to bool.

Signed-off-by: Uros Bizjak
Signed-off-by: Ingo Molnar
Acked-by: H. Peter Anvin
Cc: Linus Torvalds
Cc: Brian Gerst
Cc: Nathan Chancellor
Cc: Nick Desaulniers
Link: https://lore.kernel.org/r/20250307091022.181136-1-ubizjak@gmail.com
---
 arch/x86/boot/cpuflags.c | 26 +++++++++-----------------
 arch/x86/boot/cpuflags.h |  6 +++++-
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index d75237ba7ce9..2150a016176f 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -29,40 +29,32 @@ static int has_fpu(void)
 	return fsw == 0 && (fcw & 0x103f) == 0x003f;
 }
 
+#ifdef CONFIG_X86_32
 /*
  * For building the 16-bit code we want to explicitly specify 32-bit
  * push/pop operations, rather than just saying 'pushf' or 'popf' and
- * letting the compiler choose. But this is also included from the
- * compressed/ directory where it may be 64-bit code, and thus needs
- * to be 'pushfq' or 'popfq' in that case.
+ * letting the compiler choose.
 */
-#ifdef __x86_64__
-#define PUSHF "pushfq"
-#define POPF "popfq"
-#else
-#define PUSHF "pushfl"
-#define POPF "popfl"
-#endif
-
-int has_eflag(unsigned long mask)
+bool has_eflag(unsigned long mask)
 {
 	unsigned long f0, f1;
 
-	asm volatile(PUSHF "	\n\t"
-		     PUSHF "	\n\t"
+	asm volatile("pushfl	\n\t"
+		     "pushfl	\n\t"
 		     "pop %0	\n\t"
 		     "mov %0,%1	\n\t"
 		     "xor %2,%1	\n\t"
 		     "push %1	\n\t"
-		     POPF "	\n\t"
-		     PUSHF "	\n\t"
+		     "popfl	\n\t"
+		     "pushfl	\n\t"
 		     "pop %1	\n\t"
-		     POPF
+		     "popfl"
		     : "=&r" (f0), "=&r" (f1)
		     : "ri" (mask));
 
 	return !!((f0^f1) & mask);
 }
+#endif
 
 void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
 {
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index fdcc2aa4c3c4..a398d9204ad0 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -15,7 +15,11 @@ struct cpu_features {
 extern struct cpu_features cpu;
 extern u32 cpu_vendor[3];
 
-int has_eflag(unsigned long mask);
+#ifdef CONFIG_X86_32
+bool has_eflag(unsigned long mask);
+#else
+static inline bool has_eflag(unsigned long mask) { return true; }
+#endif
 void get_cpuflags(void);
 void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d);
 bool has_cpuflag(int flag);

From e27dffba1b1d6c047b48c325adcb1342d3e3fa69 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Thu, 13 Mar 2025 13:03:25 +0100
Subject: [PATCH 11/12] x86/boot: Move the LA57 trampoline to separate source file

To permit the EFI stub to call this code even when building the kernel
without the legacy decompressor, move the trampoline out of the latter's
startup code.

This is part of an ongoing WIP effort on my part to make the existing,
generic EFI zboot format work on x86 as well.

No functional change intended.

Signed-off-by: Ard Biesheuvel
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20250313120324.1095968-2-ardb+git@google.com
---
 arch/x86/boot/compressed/Makefile     |   1 +
 arch/x86/boot/compressed/head_64.S    | 103 -----------------------
 arch/x86/boot/compressed/la57toggle.S | 112 ++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 103 deletions(-)
 create mode 100644 arch/x86/boot/compressed/la57toggle.S

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 606c74f27459..0e0b238e8363 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -98,6 +98,7 @@ ifdef CONFIG_X86_64
 	vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
 	vmlinux-objs-y += $(obj)/pgtable_64.o
 	vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
+	vmlinux-objs-y += $(obj)/la57toggle.o
 endif
 
 vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 1dcb794c5479..3dc86352cdbe 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -483,110 +483,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
 	jmp	*%rax
 SYM_FUNC_END(.Lrelocated)
 
-/*
- * This is the 32-bit trampoline that will be copied over to low memory. It
- * will be called using the ordinary 64-bit calling convention from code
- * running in 64-bit mode.
- *
- * Return address is at the top of the stack (might be above 4G).
- * The first argument (EDI) contains the address of the temporary PGD level
- * page table in 32-bit addressable memory which will be programmed into
- * register CR3.
- */ - .section ".rodata", "a", @progbits -SYM_CODE_START(trampoline_32bit_src) - /* - * Preserve callee save 64-bit registers on the stack: this is - * necessary because the architecture does not guarantee that GPRs will - * retain their full 64-bit values across a 32-bit mode switch. - */ - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbp - pushq %rbx - - /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */ - movq %rsp, %rbx - shrq $32, %rbx - - /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ - pushq $__KERNEL32_CS - leaq 0f(%rip), %rax - pushq %rax - lretq - - /* - * The 32-bit code below will do a far jump back to long mode and end - * up here after reconfiguring the number of paging levels. First, the - * stack pointer needs to be restored to its full 64-bit value before - * the callee save register contents can be popped from the stack. - */ -.Lret: - shlq $32, %rbx - orq %rbx, %rsp - - /* Restore the preserved 64-bit registers */ - popq %rbx - popq %rbp - popq %r12 - popq %r13 - popq %r14 - popq %r15 - retq - .code32 -0: - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Point CR3 to the trampoline's new top level page table */ - movl %edi, %cr3 - - /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - /* Avoid writing EFER if no change was made (for TDX guest) */ - jc 1f - wrmsr -1: - /* Toggle CR4.LA57 */ - movl %cr4, %eax - btcl $X86_CR4_LA57_BIT, %eax - movl %eax, %cr4 - - /* Enable paging again. */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* - * Return to the 64-bit calling code using LJMP rather than LRET, to - * avoid the need for a 32-bit addressable stack. The destination - * address will be adjusted after the template code is copied into a - * 32-bit addressable buffer. - */ -.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src) -SYM_CODE_END(trampoline_32bit_src) - -/* - * This symbol is placed right after trampoline_32bit_src() so its address can - * be used to infer the size of the trampoline code. - */ -SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src) - - /* - * The trampoline code has a size limit. - * Make sure we fail to compile if the trampoline code grows - * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. - */ - .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE - - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/compressed/la57toggle.S new file mode 100644 index 000000000000..9ee002387eb1 --- /dev/null +++ b/arch/x86/boot/compressed/la57toggle.S @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include +#include +#include "pgtable.h" + +/* + * This is the 32-bit trampoline that will be copied over to low memory. It + * will be called using the ordinary 64-bit calling convention from code + * running in 64-bit mode. + * + * Return address is at the top of the stack (might be above 4G). + * The first argument (EDI) contains the address of the temporary PGD level + * page table in 32-bit addressable memory which will be programmed into + * register CR3. 
+ */ + + .section ".rodata", "a", @progbits +SYM_CODE_START(trampoline_32bit_src) + /* + * Preserve callee save 64-bit registers on the stack: this is + * necessary because the architecture does not guarantee that GPRs will + * retain their full 64-bit values across a 32-bit mode switch. + */ + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */ + movq %rsp, %rbx + shrq $32, %rbx + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS + leaq 0f(%rip), %rax + pushq %rax + lretq + + /* + * The 32-bit code below will do a far jump back to long mode and end + * up here after reconfiguring the number of paging levels. First, the + * stack pointer needs to be restored to its full 64-bit value before + * the callee save register contents can be popped from the stack. + */ +.Lret: + shlq $32, %rbx + orq %rbx, %rsp + + /* Restore the preserved 64-bit registers */ + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + retq + + .code32 +0: + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Point CR3 to the trampoline's new top level page table */ + movl %edi, %cr3 + + /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + /* Avoid writing EFER if no change was made (for TDX guest) */ + jc 1f + wrmsr +1: + /* Toggle CR4.LA57 */ + movl %cr4, %eax + btcl $X86_CR4_LA57_BIT, %eax + movl %eax, %cr4 + + /* Enable paging again. */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* + * Return to the 64-bit calling code using LJMP rather than LRET, to + * avoid the need for a 32-bit addressable stack. The destination + * address will be adjusted after the template code is copied into a + * 32-bit addressable buffer. + */ +.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src) +SYM_CODE_END(trampoline_32bit_src) + +/* + * This symbol is placed right after trampoline_32bit_src() so its address can + * be used to infer the size of the trampoline code. + */ +SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src) + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. + */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE From b25eb5f5e419b81f124d5ba2abaaacf1948fb97e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 12 Mar 2025 14:34:13 +0000 Subject: [PATCH 12/12] x86/kexec: Add relocate_kernel() debugging support: Load a GDT There are some failure modes which lead to triple-faults in the relocate_kernel() function, which is fairly much undebuggable for normal mortals. Adding a GDT in the relocate_kernel() environment is step 1 towards being able to catch faults and do something more useful. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Ard Biesheuvel Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250312144257.2348250-2-dwmw2@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index b44d8863e57f..ac058971a382 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -40,6 +40,16 @@ SYM_DATA(kexec_pa_table_page, .quad 0) SYM_DATA(kexec_pa_swap_page, .quad 0) SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) + .balign 16 +SYM_DATA_START_LOCAL(kexec_debug_gdt) + .word kexec_debug_gdt_end - kexec_debug_gdt - 1 + .long 0 + .word 0 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end) + .section .text..relocate_kernel,"ax"; .code64 SYM_CODE_START_NOALIGN(relocate_kernel) @@ -116,6 +126,19 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* store the start address on the stack */ pushq %rdx + /* Create a GDTR (16 bits limit, 64 bits addr) on stack */ + leaq kexec_debug_gdt(%rip), %rax + pushq %rax + pushw (%rax) + + /* Load the GDT, put the stack back */ + lgdt (%rsp) + addq $10, %rsp + + /* Test that we can load segments */ + movq %ds, %rax + movq %rax, %ds + /* * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP * below.
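[ Editor's note: the raw descriptor quads in kexec_debug_gdt above are
  easier to audit once decoded. A small standalone decoder, for
  illustration only, following the architectural segment-descriptor
  field layout: ]

	/* Decode the three descriptors added by this patch. */
	#include <stdio.h>

	static void decode(const char *name, unsigned long long d)
	{
		unsigned int base  = (d >> 16 & 0xffffff) | (d >> 32 & 0xff000000);
		unsigned int limit = (d & 0xffff) | (d >> 32 & 0xf0000);
		unsigned int type  = d >> 40 & 0xf;	/* segment type bits */
		unsigned int l     = d >> 53 & 1;	/* L: 64-bit code segment */
		unsigned int db    = d >> 54 & 1;	/* D/B: default operand size */
		unsigned int g     = d >> 55 & 1;	/* G: limit counted in 4KiB units */

		printf("%-13s base=%#x limit=%#x type=%#x L=%u D/B=%u G=%u\n",
		       name, base, limit, type, l, db, g);
	}

	int main(void)
	{
		decode("__KERNEL32_CS", 0x00cf9a000000ffffULL);	/* 32-bit code, flat 4GiB */
		decode("__KERNEL_CS",   0x00af9a000000ffffULL);	/* 64-bit code: L=1, D/B=0 */
		decode("__KERNEL_DS",   0x00cf92000000ffffULL);	/* writable data, flat 4GiB */
		return 0;
	}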