Skip to content

Commit

Permalink
x86/smpboot: Remove initial_stack on 64-bit
Browse files Browse the repository at this point in the history
In order to facilitate parallel startup, start to eliminate some of the
global variables passing information to CPUs in the startup path.

However, start by introducing one more: smpboot_control. For now this
merely holds the CPU# of the CPU which is coming up. Each CPU can then
find its own per-cpu data, and everything else it needs can be found
from there, allowing the other global variables to be removed.

First to be removed is initial_stack. Each CPU can load %rsp from its
current_task->thread.sp instead. That is already set up with the correct
idle thread for APs. Set up the .sp field in INIT_THREAD on x86 so that
the BSP also finds a suitable stack pointer in the static per-cpu data
when coming up on first boot.

On resume from S3, the CPU needs a temporary stack because its idle task
is already active. Instead of setting initial_stack, the sleep code can
simply set its own current->thread.sp to point to the temporary stack.
Nobody else cares about ->thread.sp for a thread which is currently on
a CPU, because the true value is actually in the %rsp register, which
is restored with the rest of the CPU context in do_suspend_lowlevel().

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Usama Arif <usama.arif@bytedance.com>
Tested-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
Link: https://lore.kernel.org/r/20230316222109.1940300-7-usama.arif@bytedance.com
  • Loading branch information
Brian Gerst authored and Thomas Gleixner committed Mar 21, 2023
1 parent cefad86 commit 3adee77
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 21 deletions.
6 changes: 5 additions & 1 deletion arch/x86/include/asm/processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,11 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)

#else
#define INIT_THREAD { }
extern unsigned long __end_init_task[];

#define INIT_THREAD { \
.sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
}

extern unsigned long KSTK_ESP(struct task_struct *task);

Expand Down
5 changes: 4 additions & 1 deletion arch/x86/include/asm/smp.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,5 +199,8 @@ extern void nmi_selftest(void);
#define nmi_selftest() do { } while (0)
#endif

#endif /* __ASSEMBLY__ */
extern unsigned int smpboot_control;

#endif /* !__ASSEMBLY__ */

#endif /* _ASM_X86_SMP_H */
20 changes: 18 additions & 2 deletions arch/x86/kernel/acpi/sleep.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,29 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
/*
* As each CPU starts up, it will find its own stack pointer
* from its current_task->thread.sp. Typically that will be
* the idle thread for a newly-started AP, or even the boot
* CPU which will find it set to &init_task in the static
* per-cpu data.
*
* Make the resuming CPU use the temporary stack at startup
* by setting current->thread.sp to point to that. The true
* %rsp will be restored with the rest of the CPU context,
* by do_suspend_lowlevel(). And unwinders don't care about
* the abuse of ->thread.sp because it's a dead variable
* while the thread is running on the CPU anyway; the true
* value is in the actual %rsp register.
*/
current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_rw(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
smpboot_control = smp_processor_id();
#endif
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0L;
saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */

/*
Expand Down
1 change: 1 addition & 0 deletions arch/x86/kernel/asm-offsets.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ static void __used common(void)
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
OFFSET(X86_current_task, pcpu_hot, current_task);
#ifdef CONFIG_CALL_DEPTH_TRACKING
OFFSET(X86_call_depth, pcpu_hot, call_depth);
#endif
Expand Down
43 changes: 28 additions & 15 deletions arch/x86/kernel/head_64.S
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ SYM_CODE_START_NOALIGN(startup_64)
* tables and then reload them.
*/

/* Set up the stack for verify_cpu(), similar to initial_stack below */
leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
/* Set up the stack for verify_cpu() */
leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp

leaq _text(%rip), %rdi

Expand Down Expand Up @@ -241,6 +241,24 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_EMPTY
ANNOTATE_NOENDBR // above

#ifdef CONFIG_SMP
movl smpboot_control(%rip), %ecx

/* Get the per cpu offset for the given CPU# which is in ECX */
movq __per_cpu_offset(,%rcx,8), %rdx
#else
xorl %edx, %edx /* zero-extended to clear all of RDX */
#endif /* CONFIG_SMP */

/*
* Setup a boot time stack - Any secondary CPU will have lost its stack
* by now because the cr3-switch above unmaps the real-mode stack.
*
* RDX contains the per-cpu offset
*/
movq pcpu_hot + X86_current_task(%rdx), %rax
movq TASK_threadsp(%rax), %rsp

/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
Expand Down Expand Up @@ -275,12 +293,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movl initial_gs+4(%rip),%edx
wrmsr

/*
* Setup a boot time stack - Any secondary CPU will have lost its stack
* by now because the cr3-switch above unmaps the real-mode stack
*/
movq initial_stack(%rip), %rsp

/* Setup and Load IDT */
pushq %rsi
call early_setup_idt
Expand Down Expand Up @@ -372,7 +384,11 @@ SYM_CODE_END(secondary_startup_64)
SYM_CODE_START(start_cpu0)
ANNOTATE_NOENDBR
UNWIND_HINT_EMPTY
movq initial_stack(%rip), %rsp

/* Find the idle task stack */
movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
movq TASK_threadsp(%rcx), %rsp

jmp .Ljump_to_C_code
SYM_CODE_END(start_cpu0)
#endif
Expand Down Expand Up @@ -420,12 +436,6 @@ SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
#ifdef CONFIG_AMD_MEM_ENCRYPT
SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif

/*
* The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
* reliably detect the end of the stack.
*/
SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
__FINITDATA

__INIT
Expand Down Expand Up @@ -660,6 +670,9 @@ SYM_DATA_END(level1_fixmap_pgt)
SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1)
SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page))

.align 16
SYM_DATA(smpboot_control, .long 0)

.align 16
/* This must match the first entry in level2_kernel_pgt */
SYM_DATA(phys_base, .quad 0x0)
Expand Down
7 changes: 6 additions & 1 deletion arch/x86/kernel/smpboot.c
Original file line number Diff line number Diff line change
Expand Up @@ -1088,7 +1088,12 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;

if (IS_ENABLED(CONFIG_X86_32)) {
initial_stack = idle->thread.sp;
} else {
smpboot_control = cpu;
}

/* Enable the espfix hack for this CPU */
init_espfix_ap(cpu);
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/xen/xen-head.S
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen)
ANNOTATE_NOENDBR
cld

mov initial_stack(%rip), %rsp
leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp

/* Set up %gs.
*
Expand Down

0 comments on commit 3adee77

Please sign in to comment.