Skip to content

Commit

Permalink
[PATCH] x86_64: actively synchronize vmalloc area when registering ce…
Browse files Browse the repository at this point in the history
…rtain callbacks

While the modular aspect of the respective i386 patch doesn't apply to
x86-64 (as the top level page directory entry is shared between modules
and the base kernel), handlers registered with register_die_notifier()
are still under similar constraints for touching ioremap()ed or
vmalloc()ed memory. The likelihood of this problem becoming visible is
of course significantly lower, as the assigned virtual addresses would
have to cross a 2**39 byte boundary. This is because the callback gets
invoked
(a) in the page fault path before the top level page table propagation
gets carried out (hence a fault to propagate the top level page table
entry/entries mapping to module's code/data would nest infinitely) and
(b) in the NMI path, where nested faults must absolutely not happen,
since otherwise the IRET from the nested fault re-enables NMIs,
potentially resulting in nested NMI occurrences.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
Jan Beulich authored and Linus Torvalds committed Mar 25, 2006
1 parent 85f9eeb commit 8c914cb
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 13 deletions.
1 change: 1 addition & 0 deletions arch/x86_64/kernel/nmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)

/*
 * Install @callback as the NMI callback.
 *
 * The vmalloc area is synchronized into all page directories first, and
 * only afterwards is the callback published via rcu_assign_pointer().
 * The callback runs in the NMI path, where a nested fault (e.g. one taken
 * to propagate a missing top level page table entry) must not happen.
 */
void set_nmi_callback(nmi_callback_t callback)
{
/* Must happen before the callback becomes visible below. */
vmalloc_sync_all();
rcu_assign_pointer(nmi_callback, callback);
}

Expand Down
2 changes: 2 additions & 0 deletions arch/x86_64/kernel/traps.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ int register_die_notifier(struct notifier_block *nb)
{
int err = 0;
unsigned long flags;

vmalloc_sync_all();
spin_lock_irqsave(&die_notifier_lock, flags);
err = notifier_chain_register(&die_chain, nb);
spin_unlock_irqrestore(&die_notifier_lock, flags);
Expand Down
73 changes: 60 additions & 13 deletions arch/x86_64/mm/fault.c
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ static int vmalloc_fault(unsigned long address)
return -1;
if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
else
BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

/* Below here mismatches are bugs because these lower tables
are shared */
Expand Down Expand Up @@ -314,16 +316,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,

/* get the address */
__asm__("movq %%cr2,%0":"=r" (address));
if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
SIGSEGV) == NOTIFY_STOP)
return;

if (likely(regs->eflags & X86_EFLAGS_IF))
local_irq_enable();

if (unlikely(page_fault_trace))
printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);

tsk = current;
mm = tsk->mm;
Expand Down Expand Up @@ -351,17 +343,30 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
*/
if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
((address >= VMALLOC_START && address < VMALLOC_END))) {
if (vmalloc_fault(address) < 0)
goto bad_area_nosemaphore;
return;
if (vmalloc_fault(address) >= 0)
return;
}
if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
SIGSEGV) == NOTIFY_STOP)
return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
*/
goto bad_area_nosemaphore;
}

if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
SIGSEGV) == NOTIFY_STOP)
return;

if (likely(regs->eflags & X86_EFLAGS_IF))
local_irq_enable();

if (unlikely(page_fault_trace))
printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);

if (unlikely(error_code & PF_RSVD))
pgtable_bad(address, regs, error_code);

Expand Down Expand Up @@ -571,6 +576,48 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
return;
}

/*
 * pgd_list is the head of a singly linked list of the pages that hold
 * top level page tables; the pages are chained through page->index, and
 * page->private holds the address of the pointer that refers to the page.
 * pgd_lock is held by pgd_list_add(), pgd_list_del() and
 * vmalloc_sync_all() while they modify or walk that list.
 */
DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
/* Note that races in the updates of insync and start aren't
problematic:
insync can only get set bits added, and updates to start are only
improving performance (without affecting correctness if undone). */
static DECLARE_BITMAP(insync, PTRS_PER_PGD);
static unsigned long start = VMALLOC_START & PGDIR_MASK;
unsigned long address;

for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
if (!test_bit(pgd_index(address), insync)) {
const pgd_t *pgd_ref = pgd_offset_k(address);
struct page *page;

if (pgd_none(*pgd_ref))
continue;
spin_lock(&pgd_lock);
for (page = pgd_list; page;
page = (struct page *)page->index) {
pgd_t *pgd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
else
BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
}
spin_unlock(&pgd_lock);
set_bit(pgd_index(address), insync);
}
if (address == start)
start = address + PGDIR_SIZE;
}
/* Check that there is no need to do the same for the modules area. */
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
}

static int __init enable_pagefaulttrace(char *str)
{
page_fault_trace = 1;
Expand Down
28 changes: 28 additions & 0 deletions include/asm-x86_64/pgalloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,39 @@ static inline void pud_free (pud_t *pud)
free_page((unsigned long)pud);
}

/*
 * Push the page backing @pgd onto the front of pgd_list.
 *
 * page->index is used as the forward link and page->private stores the
 * address of whichever pointer currently refers to the page (the list
 * head, or the previous page's index field).  Runs under pgd_lock.
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *new_head = virt_to_page(pgd);
	struct page *old_head;

	spin_lock(&pgd_lock);
	old_head = pgd_list;

	/* Link the new page in front of the current head. */
	new_head->index = (pgoff_t)old_head;
	if (old_head)
		old_head->private = (unsigned long)&new_head->index;

	/* The list head itself now refers to the new page. */
	pgd_list = new_head;
	new_head->private = (unsigned long)&pgd_list;
	spin_unlock(&pgd_lock);
}

/*
 * Unlink the page backing @pgd from pgd_list.
 *
 * page->private holds the address of the pointer that refers to this
 * page, so removal is done by redirecting that pointer to the successor
 * and handing the same back-pointer on to the successor, if there is one.
 * Runs under pgd_lock.
 */
static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *victim = virt_to_page(pgd);
	struct page *succ;
	struct page **link;

	spin_lock(&pgd_lock);
	link = (struct page **)victim->private;
	succ = (struct page *)victim->index;

	/* Whoever pointed at the victim now points at its successor. */
	*link = succ;
	if (succ)
		succ->private = (unsigned long)link;
	spin_unlock(&pgd_lock);
}

static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
unsigned boundary;
pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
if (!pgd)
return NULL;
pgd_list_add(pgd);
/*
* Copy kernel pointers in from init.
* Could keep a freelist or slab cache of those because the kernel
Expand All @@ -67,6 +94,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
/*
 * Release a top level page table.
 *
 * @pgd must be page aligned (checked with BUG_ON).  The page is removed
 * from pgd_list before it is freed, so that list walkers holding
 * pgd_lock do not encounter a page that has already been handed back.
 */
static inline void pgd_free(pgd_t *pgd)
{
BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
/* Unlink first, then free. */
pgd_list_del(pgd);
free_page((unsigned long)pgd);
}

Expand Down
4 changes: 4 additions & 0 deletions include/asm-x86_64/pgtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(x) ((pte_t) { (x).val })

/*
 * pgd_list chains the pages holding top level page tables; pgd_lock is
 * held while that list is modified or walked.  vmalloc_sync_all() copies
 * the kernel's top level entries for the vmalloc area into every page
 * table on the list.
 */
extern spinlock_t pgd_lock;
extern struct page *pgd_list;
void vmalloc_sync_all(void);

#endif /* !__ASSEMBLY__ */

extern int kern_addr_valid(unsigned long addr);
Expand Down

0 comments on commit 8c914cb

Please sign in to comment.