Skip to content

Commit

Permalink
KVM: x86/mmu: Handle page fault for private memory
Browse files Browse the repository at this point in the history
Add support for resolving page faults on guest private memory for VMs
that differentiate between "shared" and "private" memory.  For such VMs,
KVM_MEM_GUEST_MEMFD memslots can include both fd-based private memory and
hva-based shared memory, and KVM needs to map in the "correct" variant,
i.e. KVM needs to map the gfn shared/private as appropriate based on the
current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag.

For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request
shared vs. private via a bit in the guest page tables, i.e. what the guest
wants may conflict with the current memory attributes.  To support such
"implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT
to forward the request to userspace.  Add a new flag for memory faults,
KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to
map memory as shared vs. private.

Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory
so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace
needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to
exit on missing mappings when handling guest page fault VM-Exits.  In
that case, userspace will want to know RWX information in order to
correctly/precisely resolve the fault.

Note, private memory *must* be backed by guest_memfd, i.e. shared mappings
always come from the host userspace page tables, and private mappings
always come from a guest_memfd instance.

Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Message-Id: <20231027182217.3615211-21-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
  • Loading branch information
Chao Peng authored and Paolo Bonzini committed Nov 14, 2023
1 parent 90b4fe1 commit 8dd2eee
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 9 deletions.
8 changes: 6 additions & 2 deletions Documentation/virt/kvm/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6952,6 +6952,7 @@ spec refer, https://github.com/riscv/riscv-sbi-doc.

/* KVM_EXIT_MEMORY_FAULT */
struct {
#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3)
__u64 flags;
__u64 gpa;
__u64 size;
Expand All @@ -6960,8 +6961,11 @@ spec refer, https://github.com/riscv/riscv-sbi-doc.
KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that
could not be resolved by KVM. The 'gpa' and 'size' (in bytes) describe the
guest physical address range [gpa, gpa + size) of the fault. The 'flags' field
describes properties of the faulting access that are likely pertinent.
Currently, no flags are defined.
describes properties of the faulting access that are likely pertinent:

- KVM_MEMORY_EXIT_FLAG_PRIVATE - When set, indicates the memory fault occurred
on a private memory access. When clear, indicates the fault occurred on a
shared access.

Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it
accompanies a return code of '-1', not '0'! errno will always be set to EFAULT
Expand Down
101 changes: 96 additions & 5 deletions arch/x86/kvm/mmu/mmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -3147,9 +3147,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
return level;
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
int max_level)
static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot,
gfn_t gfn, int max_level, bool is_private)
{
struct kvm_lpage_info *linfo;
int host_level;
Expand All @@ -3161,13 +3161,26 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
break;
}

if (is_private)
return max_level;

if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;

host_level = host_pfn_mapping_level(kvm, gfn, slot);
return min(host_level, max_level);
}

int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
int max_level)
{
bool is_private = kvm_slot_can_be_private(slot) &&
kvm_mem_is_private(kvm, gfn);

return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
}

void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
Expand All @@ -3188,8 +3201,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
fault->gfn, fault->max_level);
fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
fault->gfn, fault->max_level,
fault->is_private);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;

Expand Down Expand Up @@ -4269,6 +4283,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
}

static inline u8 kvm_max_level_for_order(int order)
{
BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);

KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));

if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
return PG_LEVEL_1G;

if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
return PG_LEVEL_2M;

return PG_LEVEL_4K;
}

static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
PAGE_SIZE, fault->write, fault->exec,
fault->is_private);
}

static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
int max_order, r;

if (!kvm_slot_can_be_private(fault->slot)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}

r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
&max_order);
if (r) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return r;
}

fault->max_level = min(kvm_max_level_for_order(max_order),
fault->max_level);
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);

return RET_PF_CONTINUE;
}

static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
Expand Down Expand Up @@ -4301,6 +4364,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_EMULATE;
}

if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}

if (fault->is_private)
return kvm_faultin_pfn_private(vcpu, fault);

async = false;
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
Expand Down Expand Up @@ -7188,6 +7259,26 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
}

#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range)
{
/*
* Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
* supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
* can simply ignore such slots. But if userspace is making memory
* PRIVATE, then KVM must prevent the guest from accessing the memory
* as shared. And if userspace is making memory SHARED and this point
* is reached, then at least one page within the range was previously
* PRIVATE, i.e. the slot's possible hugepage ranges are changing.
* Zapping SPTEs in this case ensures KVM will reassess whether or not
* a hugepage can be used for affected ranges.
*/
if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
return false;

return kvm_unmap_gfn_range(kvm, range);
}

static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
int level)
{
Expand Down
1 change: 1 addition & 0 deletions arch/x86/kvm/mmu/mmu_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ struct kvm_page_fault {

/* Derived from mmu and global state. */
const bool is_tdp;
const bool is_private;
const bool nx_huge_page_workaround_enabled;

/*
Expand Down
8 changes: 6 additions & 2 deletions include/linux/kvm_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -2357,14 +2357,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
#define KVM_DIRTY_RING_MAX_ENTRIES 65536

static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
gpa_t gpa, gpa_t size)
gpa_t gpa, gpa_t size,
bool is_write, bool is_exec,
bool is_private)
{
vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
vcpu->run->memory_fault.gpa = gpa;
vcpu->run->memory_fault.size = size;

/* Flags are not (yet) defined or communicated to userspace. */
/* RWX flags are not (yet) defined or communicated to userspace. */
vcpu->run->memory_fault.flags = 0;
if (is_private)
vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
}

#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/kvm.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ struct kvm_run {
} notify;
/* KVM_EXIT_MEMORY_FAULT */
struct {
#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3)
__u64 flags;
__u64 gpa;
__u64 size;
Expand Down

0 comments on commit 8dd2eee

Please sign in to comment.