Merge branch 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 vdso updates from Ingo Molnar:
 "Various vDSO updates from Andy Lutomirski, mostly cleanups and
  reorganization to improve maintainability, but also some
  micro-optimizations and robustization changes"

* 'x86-vdso-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86_64/vsyscall: Restore orig_ax after vsyscall seccomp
  x86_64: Add a comment explaining the TASK_SIZE_MAX guard page
  x86_64,vsyscall: Make vsyscall emulation configurable
  x86_64, vsyscall: Rewrite comment and clean up headers in vsyscall code
  x86_64, vsyscall: Turn vsyscalls all the way off when vsyscall==none
  x86,vdso: Use LSL unconditionally for vgetcpu
  x86: vdso: Fix build with older gcc
  x86_64/vdso: Clean up vgetcpu init and merge the vdso initcalls
  x86_64/vdso: Remove jiffies from the vvar page
  x86/vdso: Make the PER_CPU segment 32 bits
  x86/vdso: Make the PER_CPU segment start out accessed
  x86/vdso: Change the PER_CPU segment to use struct desc_struct
  x86_64/vdso: Move getcpu code from vsyscall_64.c to vdso/vma.c
  x86_64/vsyscall: Move all of the gate_area code to vsyscall_64.c
Linus Torvalds committed Dec 10, 2014
2 parents c9f861c + 2689310 commit 3100e44
Showing 16 changed files with 190 additions and 200 deletions.
18 changes: 18 additions & 0 deletions arch/x86/Kconfig
@@ -992,6 +992,24 @@ config X86_ESPFIX64
def_bool y
depends on X86_16BIT && X86_64

config X86_VSYSCALL_EMULATION
bool "Enable vsyscall emulation" if EXPERT
default y
depends on X86_64
---help---
This enables emulation of the legacy vsyscall page. Disabling
it is roughly equivalent to booting with vsyscall=none, except
that it will also disable the helpful warning if a program
tries to use a vsyscall. With this option set to N, offending
programs will just segfault, citing addresses of the form
0xffffffffff600?00.

This option is required by many programs built before 2013, and
care should be used even with newer programs if set to N.

Disabling this option saves about 7K of kernel size and
possibly 4K of additional runtime pagetable memory.

config TOSHIBA
tristate "Toshiba Laptop support"
depends on X86_32
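For readers unfamiliar with the ABI the help text refers to: legacy binaries call fixed kernel-provided addresses directly. A minimal illustration (not part of this commit; the fixed address below is the historical gettimeofday vsyscall slot, and the "0xffffffffff600?00" addresses in the help text are these slots):

    #include <stdio.h>
    #include <sys/time.h>

    int main(void)
    {
            /* Historical vsyscall slot 0 (gettimeofday). */
            int (*legacy_gtod)(struct timeval *, struct timezone *) =
                    (int (*)(struct timeval *, struct timezone *))0xffffffffff600000UL;
            struct timeval tv;

            /* With CONFIG_X86_VSYSCALL_EMULATION=y this call is trapped and
             * emulated; with the option off (or vsyscall=none) it segfaults. */
            if (legacy_gtod(&tv, NULL) == 0)
                    printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
            return 0;
    }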
2 changes: 2 additions & 0 deletions arch/x86/include/asm/fixmap.h
@@ -69,7 +69,9 @@ enum fixed_addresses {
#ifdef CONFIG_X86_32
FIX_HOLE,
#else
#ifdef CONFIG_X86_VSYSCALL_EMULATION
VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
#endif
#ifdef CONFIG_PARAVIRT_CLOCK
PVCLOCK_FIXMAP_BEGIN,
PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
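A quick sanity check on the index expression guarded above, assuming the standard fixmap definition fix_to_virt(x) = FIXADDR_TOP - (x << PAGE_SHIFT):

    fix_to_virt((FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT)
        = FIXADDR_TOP - (FIXADDR_TOP - VSYSCALL_ADDR)
        = VSYSCALL_ADDR

which is exactly the invariant the BUILD_BUG_ON at the end of map_vsyscall() (vsyscall_64.c, further down) asserts.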
4 changes: 3 additions & 1 deletion arch/x86/include/asm/page_64.h
@@ -39,6 +39,8 @@ void copy_page(void *to, void *from);

#endif /* !__ASSEMBLY__ */

#define __HAVE_ARCH_GATE_AREA 1
#ifdef CONFIG_X86_VSYSCALL_EMULATION
# define __HAVE_ARCH_GATE_AREA 1
#endif

#endif /* _ASM_X86_PAGE_64_H */
8 changes: 7 additions & 1 deletion arch/x86/include/asm/processor.h
@@ -894,7 +894,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);

#else
/*
* User space process size. 47bits minus one guard page.
* User space process size. 47bits minus one guard page. The guard
* page is necessary on Intel CPUs: if a SYSCALL instruction is at
* the highest possible canonical userspace address, then that
* syscall will enter the kernel with a non-canonical return
* address, and SYSRET will explode dangerously. We avoid this
* particular problem by preventing anything from being mapped
* at the maximum canonical address.
*/
#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)

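The new comment can be made concrete with the actual numbers. An illustrative standalone sketch (not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            const uint64_t page_size      = 4096;
            const uint64_t task_size_max  = (1ULL << 47) - page_size; /* 0x00007ffffffff000 */
            const uint64_t last_canonical = (1ULL << 47) - 1;         /* 0x00007fffffffffff */

            /* If a SYSCALL instruction ended exactly at the canonical boundary,
             * the saved return address (the next instruction) would be
             * last_canonical + 1, which is non-canonical, and SYSRET would
             * fault in kernel mode.  Capping TASK_SIZE_MAX one page below the
             * boundary makes such a mapping impossible. */
            printf("TASK_SIZE_MAX        = %#llx\n", (unsigned long long)task_size_max);
            printf("last canonical byte  = %#llx\n", (unsigned long long)last_canonical);
            printf("would-be return addr = %#llx (non-canonical)\n",
                   (unsigned long long)(last_canonical + 1));
            return 0;
    }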
19 changes: 19 additions & 0 deletions arch/x86/include/asm/vgtod.h
@@ -70,4 +70,23 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s)
++s->seq;
}

#ifdef CONFIG_X86_64

#define VGETCPU_CPU_MASK 0xfff

static inline unsigned int __getcpu(void)
{
unsigned int p;

/*
* Load per CPU data from GDT. LSL is faster than RDTSCP and
* works on all CPUs.
*/
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));

return p;
}

#endif /* CONFIG_X86_64 */

#endif /* _ASM_X86_VGTOD_H */
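The raw value __getcpu() returns packs both the CPU number and the NUMA node into the segment limit; the per-CPU GDT setup (moved to arch/x86/vdso/vma.c by this series) stores 12 bits of CPU and the node above it, as its comment notes. A small decode sketch in the spirit of the vDSO's getcpu, assuming that encoding:

    /* Sketch only; SKETCH_CPU_MASK has the same value as VGETCPU_CPU_MASK. */
    #define SKETCH_CPU_MASK 0xfff

    static inline void decode_getcpu(unsigned int raw,
                                     unsigned int *cpu, unsigned int *node)
    {
            if (cpu)
                    *cpu = raw & SKETCH_CPU_MASK;   /* low 12 bits: CPU number */
            if (node)
                    *node = raw >> 12;              /* remaining bits: NUMA node */
    }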
33 changes: 6 additions & 27 deletions arch/x86/include/asm/vsyscall.h
@@ -4,41 +4,20 @@
#include <linux/seqlock.h>
#include <uapi/asm/vsyscall.h>

#define VGETCPU_RDTSCP 1
#define VGETCPU_LSL 2

/* kernel space (writeable) */
extern int vgetcpu_mode;
extern struct timezone sys_tz;

#include <asm/vvar.h>

#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);

/*
* Called on instruction fetch fault in vsyscall page.
* Returns true if handled.
*/
extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);

#ifdef CONFIG_X86_64

#define VGETCPU_CPU_MASK 0xfff

static inline unsigned int __getcpu(void)
#else
static inline void map_vsyscall(void) {}
static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
unsigned int p;

if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
native_read_tscp(&p);
} else {
/* Load per CPU data from GDT */
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
}

return p;
return false;
}
#endif /* CONFIG_X86_64 */
#endif

#endif /* _ASM_X86_VSYSCALL_H */
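The point of the new #else stubs is that callers need no ifdefs of their own. A simplified sketch of the intended call site in the page-fault path (the real check in arch/x86/mm/fault.c differs in detail; fault_was_instruction_fetch is a placeholder name):

    /* Simplified call-site sketch, not the literal fault-handler code. */
    if (fault_was_instruction_fetch &&
        (address & PAGE_MASK) == VSYSCALL_ADDR) {
            if (emulate_vsyscall(regs, address))
                    return;         /* handled; registers already updated */
    }
    /* With CONFIG_X86_VSYSCALL_EMULATION=n the stub returns false and the
     * fault falls through to the normal SIGSEGV path. */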
2 changes: 0 additions & 2 deletions arch/x86/include/asm/vvar.h
@@ -44,8 +44,6 @@ extern char __vvar_page;

/* DECLARE_VVAR(offset, type, name) */

DECLARE_VVAR(0, volatile unsigned long, jiffies)
DECLARE_VVAR(16, int, vgetcpu_mode)
DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)

#undef DECLARE_VVAR
3 changes: 1 addition & 2 deletions arch/x86/kernel/Makefile
@@ -28,8 +28,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
10 changes: 0 additions & 10 deletions arch/x86/kernel/cpu/common.c
@@ -958,14 +958,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
}

#ifdef CONFIG_X86_64
static void vgetcpu_set_mode(void)
{
if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
vgetcpu_mode = VGETCPU_RDTSCP;
else
vgetcpu_mode = VGETCPU_LSL;
}

#ifdef CONFIG_IA32_EMULATION
/* May not be __init: called during resume */
static void syscall32_cpu_init(void)
@@ -1008,8 +1000,6 @@ void __init identify_boot_cpu(void)
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
#else
vgetcpu_set_mode();
#endif
cpu_detect_tlb(&boot_cpu_data);
}
2 changes: 0 additions & 2 deletions arch/x86/kernel/setup.c
@@ -1192,9 +1192,7 @@ void __init setup_arch(char **cmdline_p)

tboot_probe();

#ifdef CONFIG_X86_64
map_vsyscall();
#endif

generic_apic_probe();

2 changes: 1 addition & 1 deletion arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
#include <asm/time.h>

#ifdef CONFIG_X86_64
__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES;
#endif

unsigned long profile_pc(struct pt_regs *regs)
147 changes: 64 additions & 83 deletions arch/x86/kernel/vsyscall_64.c
@@ -1,59 +1,43 @@
/*
* Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
*
* Based on the original implementation which is:
* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright 2003 Andi Kleen, SuSE Labs.
*
* [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
* Parts of the original code have been moved to arch/x86/vdso/vma.c
*
* This file implements vsyscall emulation. vsyscalls are a legacy ABI:
* Userspace can request certain kernel services by calling fixed
* addresses. This concept is problematic:
*
* Thanks to hpa@transmeta.com for some useful hint.
* Special thanks to Ingo Molnar for his early experience with
* a different vsyscall implementation for Linux/IA32 and for the name.
* - It interferes with ASLR.
* - It's awkward to write code that lives in kernel addresses but is
* callable by userspace at fixed addresses.
* - The whole concept is impossible for 32-bit compat userspace.
* - UML cannot easily virtualize a vsyscall.
*
* vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
* at virtual address -10Mbyte+1024bytes etc... There are at max 4
* vsyscalls. One vsyscall can reserve more than 1 slot to avoid
* jumping out of line if necessary. We cannot add more with this
* mechanism because older kernels won't return -ENOSYS.
* As of mid-2014, I believe that there is no new userspace code that
* will use a vsyscall if the vDSO is present. I hope that there will
* soon be no new userspace code that will ever use a vsyscall.
*
* Note: the concept clashes with user mode linux. UML users should
* use the vDSO.
* The code in this file emulates vsyscalls when notified of a page
* fault to a vsyscall address.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/topology.h>
#include <linux/timekeeper_internal.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/compat.h>
#include <asm/page.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

DEFINE_VVAR(int, vgetcpu_mode);

static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;

static int __init vsyscall_setup(char *str)
@@ -222,6 +206,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
"seccomp tried to change syscall nr or ip");
do_exit(SIGSYS);
}
regs->orig_ax = -1;
if (tmp)
goto do_ret; /* skip requested */
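The regs->orig_ax = -1 line added above is the "Restore orig_ax after vsyscall seccomp" fix from the series. Roughly, the surrounding flow is (heavily abridged; see the full function for the exact seccomp call):

    regs->orig_ax = syscall_nr;     /* make the emulated syscall visible to seccomp/ptrace */
    /* ... seccomp filter runs here and may alter, trace, or skip the call ... */
    regs->orig_ax = -1;             /* added by this series: no real syscall in progress */
    if (tmp)
            goto do_ret;            /* seccomp requested a skip */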

@@ -284,71 +269,67 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
}

/*
* Assume __initcall executes before all user space. Hopefully kmod
* doesn't violate that. We'll find out if it does.
* A pseudo VMA to allow ptrace access for the vsyscall page. This only
* covers the 64bit vsyscall page now. 32bit has a real VMA now and does
* not need special handling anymore:
*/
static void vsyscall_set_cpu(int cpu)
static const char *gate_vma_name(struct vm_area_struct *vma)
{
unsigned long d;
unsigned long node = 0;
#ifdef CONFIG_NUMA
node = cpu_to_node(cpu);
#endif
if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
write_rdtscp_aux((node << 12) | cpu);

/*
* Store cpu number in limit so that it can be loaded quickly
* in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
*/
d = 0x0f40000000000ULL;
d |= cpu;
d |= (node & 0xf) << 12;
d |= (node >> 4) << 48;

write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
return "[vsyscall]";
}

static void cpu_vsyscall_init(void *arg)
static struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
static struct vm_area_struct gate_vma = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
.vm_flags = VM_READ | VM_EXEC,
.vm_ops = &gate_vma_ops,
};

struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
/* preemption should be already off */
vsyscall_set_cpu(raw_smp_processor_id());
#ifdef CONFIG_IA32_EMULATION
if (!mm || mm->context.ia32_compat)
return NULL;
#endif
if (vsyscall_mode == NONE)
return NULL;
return &gate_vma;
}

static int
cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
long cpu = (long)arg;
struct vm_area_struct *vma = get_gate_vma(mm);

if (!vma)
return 0;

if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

return NOTIFY_DONE;
/*
* Use this when you have no reliable mm, typically from interrupt
* context. It is less reliable than using a task's mm and may give
* false positives.
*/
int in_gate_area_no_mm(unsigned long addr)
{
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
if (vsyscall_mode != NONE)
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);

BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}

static int __init vsyscall_init(void)
{
cpu_notifier_register_begin();

on_each_cpu(cpu_vsyscall_init, NULL, 1);
/* notifier priority > KVM */
__hotcpu_notifier(cpu_vsyscall_notifier, 30);

cpu_notifier_register_done();

return 0;
}
__initcall(vsyscall_init);
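The gate-VMA code above is also what makes the page show up as [vsyscall] in /proc/<pid>/maps and keeps it reachable for ptrace-style access. A quick userspace check (illustrative only; nothing here is part of the commit): when get_gate_vma() returns NULL, i.e. with CONFIG_X86_VSYSCALL_EMULATION=n or vsyscall=none, the line disappears.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/maps", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    if (strstr(line, "[vsyscall]")) {
                            printf("vsyscall page present: %s", line);
                            fclose(f);
                            return 0;
                    }
            }
            fclose(f);
            printf("no vsyscall page mapped\n");
            return 0;
    }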