Skip to content

Commit

Permalink
KVM: MMU: out of sync shadow core
Browse files Browse the repository at this point in the history
Allow guest pagetables to go out of sync.  Instead of emulating write
accesses to guest pagetables, or unshadowing them, we un-write-protect
the page table and allow the guest to modify it at will.  We rely on
invlpg executions to synchronize individual ptes, and will synchronize
the entire pagetable on tlb flushes.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
  • Loading branch information
Marcelo Tosatti authored and Avi Kivity committed Oct 15, 2008
1 parent 6844dec commit 4731d4c
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 18 deletions.
210 changes: 193 additions & 17 deletions arch/x86/kvm/mmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ struct kvm_shadow_walk {
u64 addr, u64 *spte, int level);
};

struct kvm_unsync_walk {
int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
};

typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);

static struct kmem_cache *pte_chain_cache;
Expand Down Expand Up @@ -654,8 +658,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)

if (write_protected)
kvm_flush_remote_tlbs(kvm);

account_shadowed(kvm, gfn);
}

static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
Expand Down Expand Up @@ -908,6 +910,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_unsync_walk *walker)
{
int i, ret;

if (!sp->unsync_children)
return 0;

for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
u64 ent = sp->spt[i];

if (is_shadow_present_pte(ent)) {
struct kvm_mmu_page *child;
child = page_header(ent & PT64_BASE_ADDR_MASK);

if (child->unsync_children) {
ret = mmu_unsync_walk(child, walker);
if (ret)
return ret;
}

if (child->unsync) {
ret = walker->entry(child, walker);
if (ret)
return ret;
}
}
}

if (i == PT64_ENT_PER_PAGE)
sp->unsync_children = 0;

return 0;
}

static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
Expand All @@ -928,6 +965,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
sp->unsync = 0;
--kvm->stat.mmu_unsync;
}

static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);

static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
if (sp->role.glevels != vcpu->arch.mmu.root_level) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}

rmap_write_protect(vcpu->kvm, sp->gfn);
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}

kvm_mmu_flush_tlb(vcpu);
kvm_unlink_unsync_page(vcpu->kvm, sp);
return 0;
}

struct sync_walker {
struct kvm_vcpu *vcpu;
struct kvm_unsync_walk walker;
};

static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
{
struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
walker);
struct kvm_vcpu *vcpu = sync_walk->vcpu;

kvm_sync_page(vcpu, sp);
return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
}

static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
struct sync_walker walker = {
.walker = { .entry = mmu_sync_fn, },
.vcpu = vcpu,
};

while (mmu_unsync_walk(sp, &walker.walker))
cond_resched_lock(&vcpu->kvm->mmu_lock);
}

static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
Expand All @@ -941,7 +1031,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *sp;
struct hlist_node *node;
struct hlist_node *node, *tmp;

role.word = 0;
role.glevels = vcpu->arch.mmu.root_level;
Expand All @@ -957,8 +1047,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
if (sp->gfn == gfn && sp->role.word == role.word) {
hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
if (sp->gfn == gfn) {
if (sp->unsync)
if (kvm_sync_page(vcpu, sp))
continue;

if (sp->role.word != role.word)
continue;

if (sp->unsync_children)
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);

mmu_page_add_parent_pte(vcpu, sp, parent_pte);
pgprintk("%s: found\n", __func__);
return sp;
Expand All @@ -971,8 +1071,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
if (!metaphysical)
if (!metaphysical) {
rmap_write_protect(vcpu->kvm, gfn);
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
vcpu->arch.mmu.prefetch_page(vcpu, sp);
else
Expand Down Expand Up @@ -1078,14 +1180,47 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
}

struct zap_walker {
struct kvm_unsync_walk walker;
struct kvm *kvm;
int zapped;
};

static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
{
struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
walker);
kvm_mmu_zap_page(zap_walk->kvm, sp);
zap_walk->zapped = 1;
return 0;
}

static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct zap_walker walker = {
.walker = { .entry = mmu_zap_fn, },
.kvm = kvm,
.zapped = 0,
};

if (sp->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
mmu_unsync_walk(sp, &walker.walker);
return walker.zapped;
}

static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
int ret;
++kvm->stat.mmu_shadow_zapped;
ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.metaphysical)
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
Expand All @@ -1095,7 +1230,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_reload_remote_mmus(kvm);
}
kvm_mmu_reset_last_pte_updated(kvm);
return 0;
return ret;
}

/*
Expand Down Expand Up @@ -1201,10 +1336,58 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
return page;
}

static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
sp->unsync_children = 1;
return 1;
}

static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
unsigned index;
struct hlist_head *bucket;
struct kvm_mmu_page *s;
struct hlist_node *node, *n;

index = kvm_page_table_hashfn(sp->gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
/* don't unsync if pagetable is shadowed with multiple roles */
hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
if (s->gfn != sp->gfn || s->role.metaphysical)
continue;
if (s->role.word != sp->role.word)
return 1;
}
mmu_parent_walk(vcpu, sp, unsync_walk_fn);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
mmu_convert_notrap(sp);
return 0;
}

static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
bool can_unsync)
{
struct kvm_mmu_page *shadow;

shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow) {
if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
return 1;
if (shadow->unsync)
return 0;
if (can_unsync)
return kvm_unsync_page(vcpu, shadow);
return 1;
}
return 0;
}

static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
gfn_t gfn, pfn_t pfn, bool speculative)
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{
u64 spte;
int ret = 0;
Expand All @@ -1231,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,

if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;

if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
ret = 1;
Expand All @@ -1241,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,

spte |= PT_WRITABLE_MASK;

shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow) {
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
ret = 1;
Expand All @@ -1260,7 +1441,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
return ret;
}


static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
Expand Down Expand Up @@ -1298,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
dirty, largepage, gfn, pfn, speculative)) {
dirty, largepage, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
Expand Down Expand Up @@ -1518,10 +1698,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}

static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
}

static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/kvm/paging_tmpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_pte(gpte), 0, gfn,
spte_to_pfn(sp->spt[i]), true);
spte_to_pfn(sp->spt[i]), true, false);
}

return !nr_present;
Expand Down
3 changes: 3 additions & 0 deletions arch/x86/kvm/x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
Expand Down Expand Up @@ -3120,6 +3121,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
Expand Down
3 changes: 3 additions & 0 deletions include/asm-x86/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ struct kvm_mmu_page {
*/
int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
bool unsync;
bool unsync_children;
union {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
Expand Down Expand Up @@ -371,6 +373,7 @@ struct kvm_vm_stat {
u32 mmu_flooded;
u32 mmu_recycled;
u32 mmu_cache_miss;
u32 mmu_unsync;
u32 remote_tlb_flush;
u32 lpages;
};
Expand Down
1 change: 1 addition & 0 deletions include/linux/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#define KVM_REQ_TRIPLE_FAULT 4
#define KVM_REQ_PENDING_TIMER 5
#define KVM_REQ_UNHALT 6
#define KVM_REQ_MMU_SYNC 7

struct kvm_vcpu;
extern struct kmem_cache *kvm_vcpu_cache;
Expand Down

0 comments on commit 4731d4c

Please sign in to comment.