diff --git a/[refs] b/[refs] index d9e77c6bfbe4..5c1a9e72f510 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 5bdeae46be6dfe9efa44a548bd622af325f4bdb4 +refs/heads/master: 8c4ac0949f7779cd4cc8c618f1b07e6113682010 diff --git a/trunk/Documentation/DocBook/kernel-api.tmpl b/trunk/Documentation/DocBook/kernel-api.tmpl index 77436d735013..aa38cc5692a0 100644 --- a/trunk/Documentation/DocBook/kernel-api.tmpl +++ b/trunk/Documentation/DocBook/kernel-api.tmpl @@ -419,13 +419,7 @@ X!Edrivers/pnp/system.c Block Devices -!Eblock/blk-core.c -!Eblock/blk-map.c -!Iblock/blk-sysfs.c -!Eblock/blk-settings.c -!Eblock/blk-exec.c -!Eblock/blk-barrier.c -!Eblock/blk-tag.c +!Eblock/ll_rw_blk.c diff --git a/trunk/Documentation/lguest/lguest.c b/trunk/Documentation/lguest/lguest.c index 6c8a2386cd50..9b0e322118b5 100644 --- a/trunk/Documentation/lguest/lguest.c +++ b/trunk/Documentation/lguest/lguest.c @@ -79,9 +79,6 @@ static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; -/* a per-cpu variable indicating whose vcpu is currently running */ -static unsigned int __thread cpu_id; - /* This is our list of devices. */ struct device_list { @@ -156,9 +153,6 @@ struct virtqueue void (*handle_output)(int fd, struct virtqueue *me); }; -/* Remember the arguments to the program so we can "reboot" */ -static char **main_args; - /* Since guest is UP and we don't run at the same time, we don't need barriers. * But I include them in the code in case others copy it. */ #define wmb() @@ -560,7 +554,7 @@ static void wake_parent(int pipefd, int lguest_fd) else FD_CLR(-fd - 1, &devices.infds); } else /* Send LHREQ_BREAK command. */ - pwrite(lguest_fd, args, sizeof(args), cpu_id); + write(lguest_fd, args, sizeof(args)); } } @@ -1495,9 +1489,7 @@ static void setup_block_file(const char *filename) /* Create stack for thread and run it */ stack = malloc(32768); - /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from - * becoming a zombie. */ - if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) + if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) err(1, "Creating clone"); /* We don't need to keep the I/O thread's end of the pipes open. */ @@ -1507,21 +1499,7 @@ static void setup_block_file(const char *filename) verbose("device %u: virtblock %llu sectors\n", devices.device_num, cap); } -/* That's the end of device setup. :*/ - -/* Reboot */ -static void __attribute__((noreturn)) restart_guest(void) -{ - unsigned int i; - - /* Closing pipes causes the waker thread and io_threads to die, and - * closing /dev/lguest cleans up the Guest. Since we don't track all - * open fds, we simply close everything beyond stderr. */ - for (i = 3; i < FD_SETSIZE; i++) - close(i); - execv(main_args[0], main_args); - err(1, "Could not exec %s", main_args[0]); -} +/* That's the end of device setup. */ /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves * its input and output, and finally, lays it to rest. */ @@ -1533,8 +1511,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) int readval; /* We read from the /dev/lguest device to run the Guest. 
*/ - readval = pread(lguest_fd, &notify_addr, - sizeof(notify_addr), cpu_id); + readval = read(lguest_fd, &notify_addr, sizeof(notify_addr)); /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { @@ -1544,23 +1521,16 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); + read(lguest_fd, reason, sizeof(reason)-1); errx(1, "%s", reason); - /* ERESTART means that we need to reboot the guest */ - } else if (errno == ERESTART) { - restart_guest(); /* EAGAIN means the Waker wanted us to look at some input. * Anything else means a bug or incompatible change. */ } else if (errno != EAGAIN) err(1, "Running guest failed"); - /* Only service input on thread for CPU 0. */ - if (cpu_id != 0) - continue; - /* Service input, then unset the BREAK to release the Waker. */ handle_input(lguest_fd); - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) + if (write(lguest_fd, args, sizeof(args)) < 0) err(1, "Resetting break"); } } @@ -1601,12 +1571,6 @@ int main(int argc, char *argv[]) /* If they specify an initrd file to load. */ const char *initrd_name = NULL; - /* Save the args: we "reboot" by execing ourselves again. */ - main_args = argv; - /* We don't "wait" for the children, so prevent them from becoming - * zombies. */ - signal(SIGCHLD, SIG_IGN); - /* First we initialize the device list. Since console and network * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the @@ -1618,7 +1582,6 @@ int main(int argc, char *argv[]) devices.lastdev = &devices.dev; devices.next_irq = 1; - cpu_id = 0; /* We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line.
So we quickly look through the arguments to find the amount diff --git a/trunk/arch/ia64/hp/sim/simscsi.c b/trunk/arch/ia64/hp/sim/simscsi.c index 7661bb065fa5..6ef9b5219930 100644 --- a/trunk/arch/ia64/hp/sim/simscsi.c +++ b/trunk/arch/ia64/hp/sim/simscsi.c @@ -360,6 +360,7 @@ static struct scsi_host_template driver_template = { .max_sectors = 1024, .cmd_per_lun = SIMSCSI_REQ_QUEUE_LEN, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int __init diff --git a/trunk/arch/powerpc/kernel/vio.c b/trunk/arch/powerpc/kernel/vio.c index f0bad7070fb5..19a5656001c0 100644 --- a/trunk/arch/powerpc/kernel/vio.c +++ b/trunk/arch/powerpc/kernel/vio.c @@ -37,6 +37,8 @@ #include #include +extern struct kset devices_subsys; /* needed for vio_find_name() */ + static struct bus_type vio_bus_type; static struct vio_dev vio_bus_device = { /* fake "parent" device */ @@ -359,16 +361,19 @@ EXPORT_SYMBOL(vio_get_attribute); #ifdef CONFIG_PPC_PSERIES /* vio_find_name() - internal because only vio.c knows how we formatted the * kobject name + * XXX once vio_bus_type.devices is actually used as a kset in + * drivers/base/bus.c, this function should be removed in favor of + * "device_find(kobj_name, &vio_bus_type)" */ -static struct vio_dev *vio_find_name(const char *name) +static struct vio_dev *vio_find_name(const char *kobj_name) { - struct device *found; + struct kobject *found; - found = bus_find_device_by_name(&vio_bus_type, NULL, name); + found = kset_find_obj(&devices_subsys, kobj_name); if (!found) return NULL; - return to_vio_dev(found); + return to_vio_dev(container_of(found, struct device, kobj)); } /** diff --git a/trunk/arch/x86/Kconfig b/trunk/arch/x86/Kconfig index 65b449134cf7..fb3eea3e38ee 100644 --- a/trunk/arch/x86/Kconfig +++ b/trunk/arch/x86/Kconfig @@ -107,7 +107,6 @@ config ARCH_SUPPORTS_OPROFILE bool default y -select HAVE_KVM config ZONE_DMA32 bool @@ -1599,6 +1598,4 @@ source "security/Kconfig" source "crypto/Kconfig" -source "arch/x86/kvm/Kconfig" - source "lib/Kconfig" diff --git a/trunk/arch/x86/Makefile b/trunk/arch/x86/Makefile index da8f4129780b..b08f18261df6 100644 --- a/trunk/arch/x86/Makefile +++ b/trunk/arch/x86/Makefile @@ -7,8 +7,6 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -core-$(CONFIG_KVM) += arch/x86/kvm/ - # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles. # e.g.: obj-y += foo_$(BITS).o diff --git a/trunk/arch/x86/kvm/irq.h b/trunk/arch/x86/kvm/irq.h deleted file mode 100644 index fa5ed5d59b5d..000000000000 --- a/trunk/arch/x86/kvm/irq.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2007, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. 
- * Authors: - * Yaozu (Eddie) Dong - * - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include -#include -#include - -#include "iodev.h" -#include "ioapic.h" -#include "lapic.h" - -struct kvm; -struct kvm_vcpu; - -typedef void irq_request_func(void *opaque, int level); - -struct kvm_kpic_state { - u8 last_irr; /* edge detection */ - u8 irr; /* interrupt request register */ - u8 imr; /* interrupt mask register */ - u8 isr; /* interrupt service register */ - u8 priority_add; /* highest irq priority */ - u8 irq_base; - u8 read_reg_select; - u8 poll; - u8 special_mask; - u8 init_state; - u8 auto_eoi; - u8 rotate_on_auto_eoi; - u8 special_fully_nested_mode; - u8 init4; /* true if 4 byte init */ - u8 elcr; /* PIIX edge/trigger selection */ - u8 elcr_mask; - struct kvm_pic *pics_state; -}; - -struct kvm_pic { - struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; - int output; /* intr from master PIC */ - struct kvm_io_device dev; -}; - -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); -void kvm_pic_update_irq(struct kvm_pic *s); - -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return pic_irqchip(kvm) != NULL; -} - -void kvm_pic_reset(struct kvm_kpic_state *s); - -void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); -void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); - -#endif diff --git a/trunk/arch/x86/kvm/lapic.h b/trunk/arch/x86/kvm/lapic.h deleted file mode 100644 index 676c396c9cee..000000000000 --- a/trunk/arch/x86/kvm/lapic.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef __KVM_X86_LAPIC_H -#define __KVM_X86_LAPIC_H - -#include "iodev.h" - -#include - -struct kvm_lapic { - unsigned long base_address; - struct kvm_io_device dev; - struct { - atomic_t pending; - s64 period; /* unit: ns */ - u32 divide_count; - ktime_t last_update; - struct hrtimer dev; - } timer; - struct kvm_vcpu *vcpu; - struct page *regs_page; - void *regs; - gpa_t vapic_addr; - struct page *vapic_page; -}; -int kvm_create_lapic(struct kvm_vcpu *vcpu); -void kvm_free_lapic(struct kvm_vcpu *vcpu); - -int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); -u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); - -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); - -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); - -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); -void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); - -#endif diff --git a/trunk/arch/x86/kvm/mmu.c 
b/trunk/arch/x86/kvm/mmu.c deleted file mode 100644 index 8efdcdbebb03..000000000000 --- a/trunk/arch/x86/kvm/mmu.c +++ /dev/null @@ -1,1885 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay - * Avi Kivity - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "vmx.h" -#include "mmu.h" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#undef MMU_DEBUG - -#undef AUDIT - -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif - -#ifdef MMU_DEBUG - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - -#else - -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) - -#endif - -#if defined(MMU_DEBUG) || defined(AUDIT) -static int dbg = 1; -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } -#endif - -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - -#define PT_WRITABLE_SHIFT 1 - -#define PT_PRESENT_MASK (1ULL << 0) -#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) -#define PT_PWT_MASK (1ULL << 3) -#define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) -#define PT_PAT_MASK (1ULL << 7) -#define PT_GLOBAL_MASK (1ULL << 8) -#define PT64_NX_SHIFT 63 -#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) - -#define PT_PAT_SHIFT 7 -#define PT_DIR_PAT_SHIFT 12 -#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) - -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - - -#define PT_FIRST_AVAIL_BITS_SHIFT 9 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 - -#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) - -#define PT64_PERM_MASK 
(PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) - -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT64_ROOT_LEVEL 4 -#define PT32_ROOT_LEVEL 2 -#define PT32E_ROOT_LEVEL 3 - -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - -#define RMAP_EXT 4 - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; - struct kvm_rmap_desc *more; -}; - -static struct kmem_cache *pte_chain_cache; -static struct kmem_cache *rmap_desc_cache; -static struct kmem_cache *mmu_page_header_cache; - -static u64 __read_mostly shadow_trap_nonpresent_pte; -static u64 __read_mostly shadow_notrap_nonpresent_pte; - -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) -{ - shadow_trap_nonpresent_pte = trap_pte; - shadow_notrap_nonpresent_pte = notrap_pte; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); - -static int is_write_protection(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_WP; -} - -static int is_cpuid_PSE36(void) -{ - return 1; -} - -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.shadow_efer & EFER_NX; -} - -static int is_present_pte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - -static int is_shadow_present_pte(u64 pte) -{ - pte &= ~PT_SHADOW_IO_MARK; - return pte != shadow_trap_nonpresent_pte - && pte != shadow_notrap_nonpresent_pte; -} - -static int is_writeble_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -static int is_dirty_pte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - -static int is_io_pte(unsigned long pte) -{ - return pte & PT_SHADOW_IO_MARK; -} - -static int is_rmap_pte(u64 pte) -{ - return pte != shadow_trap_nonpresent_pte - && pte != shadow_notrap_nonpresent_pte; -} - -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - -static void set_shadow_pte(u64 *sptep, u64 spte) -{ -#ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); -#else - set_64bit((unsigned long long *)sptep, spte); -#endif -} - -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); - if (!obj) - return -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - struct page *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - set_page_private(page, 0); - cache->objects[cache->nobjs++] = page_address(page); - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) -{ - int r; - - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, - pte_chain_cache, 4); - if (r) - goto out; - r = 
mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 1); - if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); - if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; -} - -static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - memset(p, 0, size); - return p; -} - -static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, - sizeof(struct kvm_pte_chain)); -} - -static void mmu_free_pte_chain(struct kvm_pte_chain *pc) -{ - kfree(pc); -} - -static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, - sizeof(struct kvm_rmap_desc)); -} - -static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) -{ - kfree(rd); -} - -/* - * Take gfn and return the reverse mapping to it. - * Note: gfn must be unaliased before this function get called - */ - -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); - return &slot->rmap[gfn - slot->base_gfn]; -} - -/* - * Reverse mapping data structures: - * - * If rmapp bit zero is zero, then rmapp point to the shadw page table entry - * that points to page_address(page). - * - * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc - * containing more mappings. 
- */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - struct kvm_rmap_desc *desc; - unsigned long *rmapp; - int i; - - if (!is_rmap_pte(*spte)) - return; - gfn = unalias_gfn(vcpu->kvm, gfn); - sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; - rmapp = gfn_to_rmap(vcpu->kvm, gfn); - if (!*rmapp) { - rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); - *rmapp = (unsigned long)spte; - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)*rmapp; - desc->shadow_ptes[1] = spte; - *rmapp = (unsigned long)desc | 1; - } else { - rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) - desc = desc->more; - if (desc->shadow_ptes[RMAP_EXT-1]) { - desc->more = mmu_alloc_rmap_desc(vcpu); - desc = desc->more; - } - for (i = 0; desc->shadow_ptes[i]; ++i) - ; - desc->shadow_ptes[i] = spte; - } -} - -static void rmap_desc_remove_entry(unsigned long *rmapp, - struct kvm_rmap_desc *desc, - int i, - struct kvm_rmap_desc *prev_desc) -{ - int j; - - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) - ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; - if (j != 0) - return; - if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->shadow_ptes[0]; - else - if (prev_desc) - prev_desc->more = desc->more; - else - *rmapp = (unsigned long)desc->more | 1; - mmu_free_rmap_desc(desc); -} - -static void rmap_remove(struct kvm *kvm, u64 *spte) -{ - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - struct kvm_mmu_page *sp; - struct page *page; - unsigned long *rmapp; - int i; - - if (!is_rmap_pte(*spte)) - return; - sp = page_header(__pa(spte)); - page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - mark_page_accessed(page); - if (is_writeble_pte(*spte)) - kvm_release_page_dirty(page); - else - kvm_release_page_clean(page); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); - if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); - BUG(); - } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); - if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); - BUG(); - } - *rmapp = 0; - } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { - rmap_desc_remove_entry(rmapp, - desc, i, - prev_desc); - return; - } - prev_desc = desc; - desc = desc->more; - } - BUG(); - } -} - -static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -{ - struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; - u64 *prev_spte; - int i; - - if (!*rmapp) - return NULL; - else if (!(*rmapp & 1)) { - if (!spte) - return (u64 *)*rmapp; - return NULL; - } - desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; - prev_spte = NULL; - while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { - if (prev_spte == spte) - return desc->shadow_ptes[i]; - prev_spte = desc->shadow_ptes[i]; - } - desc = desc->more; - } - return NULL; -} - -static void rmap_write_protect(struct kvm *kvm, u64 gfn) -{ - unsigned long *rmapp; - u64 *spte; - int write_protected = 0; - - gfn = 
unalias_gfn(kvm, gfn); - rmapp = gfn_to_rmap(kvm, gfn); - - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!spte); - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writeble_pte(*spte)) { - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; - } - spte = rmap_next(kvm, rmapp, spte); - } - if (write_protected) - kvm_flush_remote_tlbs(kvm); -} - -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) -{ - u64 *pos; - u64 *end; - - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { - printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, - pos, *pos); - return 0; - } - return 1; -} -#endif - -static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - ASSERT(is_empty_shadow_page(sp->spt)); - list_del(&sp->link); - __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); - kfree(sp); - ++kvm->arch.n_free_mmu_pages; -} - -static unsigned kvm_page_table_hashfn(gfn_t gfn) -{ - return gfn; -} - -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) -{ - struct kvm_mmu_page *sp; - - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; - sp->multimapped = 0; - sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; - return sp; -} - -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!parent_pte) - return; - if (!sp->multimapped) { - u64 *old = sp->parent_pte; - - if (!old) { - sp->parent_pte = parent_pte; - return; - } - sp->multimapped = 1; - pte_chain = mmu_alloc_pte_chain(vcpu); - INIT_HLIST_HEAD(&sp->parent_ptes); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = old; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { - if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) - continue; - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) - if (!pte_chain->parent_ptes[i]) { - pte_chain->parent_ptes[i] = parent_pte; - return; - } - } - pte_chain = mmu_alloc_pte_chain(vcpu); - BUG_ON(!pte_chain); - hlist_add_head(&pte_chain->link, &sp->parent_ptes); - pte_chain->parent_ptes[0] = parent_pte; -} - -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, - u64 *parent_pte) -{ - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; - - if (!sp->multimapped) { - BUG_ON(sp->parent_pte != parent_pte); - sp->parent_pte = NULL; - return; - } - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - if (pte_chain->parent_ptes[i] != parent_pte) - continue; - while (i + 1 < NR_PTE_CHAIN_ENTRIES - && pte_chain->parent_ptes[i + 1]) { - pte_chain->parent_ptes[i] - = pte_chain->parent_ptes[i + 1]; - ++i; - } - pte_chain->parent_ptes[i] = NULL; - if (i == 0) { - hlist_del(&pte_chain->link); - mmu_free_pte_chain(pte_chain); - if (hlist_empty(&sp->parent_ptes)) { - sp->multimapped = 0; - sp->parent_pte = NULL; - } 
- } - return; - } - BUG(); -} - -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { - pgprintk("%s: found role %x\n", - __FUNCTION__, sp->role.word); - return sp; - } - return NULL; -} - -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int metaphysical, - unsigned access, - u64 *parent_pte, - bool *new_page) -{ - union kvm_mmu_page_role role; - unsigned index; - unsigned quadrant; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - role.word = 0; - role.glevels = vcpu->arch.mmu.root_level; - role.level = level; - role.metaphysical = metaphysical; - role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, - gfn, role.word); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - pgprintk("%s: found\n", __FUNCTION__); - return sp; - } - ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte); - if (!sp) - return sp; - pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); - sp->gfn = gfn; - sp->role = role; - hlist_add_head(&sp->hash_link, bucket); - vcpu->arch.mmu.prefetch_page(vcpu, sp); - if (!metaphysical) - rmap_write_protect(vcpu->kvm, gfn); - if (new_page) - *new_page = 1; - return sp; -} - -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) -{ - unsigned i; - u64 *pt; - u64 ent; - - pt = sp->spt; - - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (is_shadow_present_pte(pt[i])) - rmap_remove(kvm, &pt[i]); - pt[i] = shadow_trap_nonpresent_pte; - } - kvm_flush_remote_tlbs(kvm); - return; - } - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - ent = pt[i]; - - pt[i] = shadow_trap_nonpresent_pte; - if (!is_shadow_present_pte(ent)) - continue; - ent &= PT64_BASE_ADDR_MASK; - mmu_page_remove_parent_pte(page_header(ent), &pt[i]); - } - kvm_flush_remote_tlbs(kvm); -} - -static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); -} - -static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) -{ - int i; - - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm->vcpus[i]->arch.last_pte_updated = NULL; -} - -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - u64 *parent_pte; - - ++kvm->stat.mmu_shadow_zapped; - while (sp->multimapped || sp->parent_pte) { - if (!sp->multimapped) - parent_pte = sp->parent_pte; - else { - struct kvm_pte_chain *chain; - - chain = container_of(sp->parent_ptes.first, - struct kvm_pte_chain, link); - parent_pte = chain->parent_ptes[0]; - } - BUG_ON(!parent_pte); - kvm_mmu_put_page(sp, parent_pte); - set_shadow_pte(parent_pte, 
shadow_trap_nonpresent_pte); - } - kvm_mmu_page_unlink_children(kvm, sp); - if (!sp->root_count) { - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); - } else - list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_mmu_reset_last_pte_updated(kvm); -} - -/* - * Changing the number of mmu pages allocated to the vm - * Note: if kvm_nr_mmu_pages is too small, you will get dead lock - */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) -{ - /* - * If we set the number of mmu pages to be smaller be than the - * number of actived pages , we must to free some mmu pages before we - * change the value - */ - - if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > - kvm_nr_mmu_pages) { - int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - - while (n_used_mmu_pages > kvm_nr_mmu_pages) { - struct kvm_mmu_page *page; - - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); - n_used_mmu_pages--; - } - kvm->arch.n_free_mmu_pages = 0; - } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; - - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; -} - -static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node, *n; - int r; - - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - r = 0; - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { - pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, - sp->role.word); - kvm_mmu_zap_page(kvm, sp); - r = 1; - } - return r; -} - -static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - - while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); - } -} - -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); - struct kvm_mmu_page *sp = page_header(__pa(pte)); - - __set_bit(slot, &sp->slot_bitmap); -} - -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - - if (gpa == UNMAPPED_GVA) - return NULL; - return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); -} - -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, gfn_t gfn, struct page *page) -{ - u64 spte; - int was_rmapped = is_rmap_pte(*shadow_pte); - int was_writeble = is_writeble_pte(*shadow_pte); - - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __FUNCTION__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); - - /* - * We don't set the accessed bit, since we sometimes want to see - * whether the guest actually used the pte (in order to detect - * demand paging). 
- */ - spte = PT_PRESENT_MASK | PT_DIRTY_MASK; - if (!dirty) - pte_access &= ~ACC_WRITE_MASK; - if (!(pte_access & ACC_EXEC_MASK)) - spte |= PT64_NX_MASK; - - spte |= PT_PRESENT_MASK; - if (pte_access & ACC_USER_MASK) - spte |= PT_USER_MASK; - - if (is_error_page(page)) { - set_shadow_pte(shadow_pte, - shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); - kvm_release_page_clean(page); - return; - } - - spte |= page_to_phys(page); - - if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; - - spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; - } - - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); - pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { - spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } - if (write_fault) - *ptwrite = 1; - } - } - -unshadowed: - - if (pte_access & ACC_WRITE_MASK) - mark_page_dirty(vcpu->kvm, gfn); - - pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); - set_shadow_pte(shadow_pte, spte); - page_header_update_slot(vcpu->kvm, shadow_pte, gfn); - if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn); - if (!is_rmap_pte(*shadow_pte)) - kvm_release_page_clean(page); - } else { - if (was_writeble) - kvm_release_page_dirty(page); - else - kvm_release_page_clean(page); - } - if (!ptwrite || !*ptwrite) - vcpu->arch.last_pte_updated = shadow_pte; -} - -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ -} - -static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, - gfn_t gfn, struct page *page) -{ - int level = PT32E_ROOT_LEVEL; - hpa_t table_addr = vcpu->arch.mmu.root_hpa; - int pt_write = 0; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; - - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); - - if (level == 1) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, gfn, page); - return pt_write || is_io_pte(table[index]); - } - - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index], - NULL); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_page_clean(page); - return -ENOMEM; - } - - table[index] = __pa(new_table->spt) | PT_PRESENT_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; - } -} - -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -{ - int r; - - struct page *page; - - down_read(&current->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, gfn); - - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - r = __nonpaging_map(vcpu, v, write, gfn, page); - spin_unlock(&vcpu->kvm->mmu_lock); - - up_read(&current->mm->mmap_sem); - - return r; -} - - -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = shadow_trap_nonpresent_pte; -} - -static void mmu_free_roots(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - spin_lock(&vcpu->kvm->mmu_lock); -#ifdef CONFIG_X86_64 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - sp =
page_header(root); - --sp->root_count; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - spin_unlock(&vcpu->kvm->mmu_lock); - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - --sp->root_count; - } - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - } - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; -} - -static void mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - int i; - gfn_t root_gfn; - struct kvm_mmu_page *sp; - - root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; - -#ifdef CONFIG_X86_64 - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - ASSERT(!VALID_PAGE(root)); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - vcpu->arch.mmu.root_hpa = root; - return; - } -#endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - ASSERT(!VALID_PAGE(root)); - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { - vcpu->arch.mmu.pae_root[i] = 0; - continue; - } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; - } else if (vcpu->arch.mmu.root_level == 0) - root_gfn = 0; - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, !is_paging(vcpu), - ACC_ALL, NULL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; - } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); -} - -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) -{ - return vaddr; -} - -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code) -{ - gfn_t gfn; - int r; - - pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - gfn = gva >> PAGE_SHIFT; - - return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn); -} - -static void nonpaging_free(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - context->new_cr3 = nonpaging_new_cr3; - context->page_fault = nonpaging_page_fault; - context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; - context->prefetch_page = nonpaging_prefetch_page; - context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); -} - -static void paging_new_cr3(struct kvm_vcpu *vcpu) -{ - pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); - mmu_free_roots(vcpu); -} - -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) -{ - kvm_inject_page_fault(vcpu, addr, err_code); -} - -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - -#define PTTYPE 64 -#include "paging_tmpl.h" -#undef PTTYPE - -#define PTTYPE 32 -#include "paging_tmpl.h" -#undef PTTYPE - -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; - context->page_fault = paging64_page_fault; - context->gva_to_gpa = paging64_gva_to_gpa; - context->prefetch_page = paging64_prefetch_page; 
- context->free = paging_free; - context->root_level = level; - context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging64_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); -} - -static int paging32_init_context(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = &vcpu->arch.mmu; - - context->new_cr3 = paging_new_cr3; - context->page_fault = paging32_page_fault; - context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; - context->prefetch_page = paging32_prefetch_page; - context->root_level = PT32_ROOT_LEVEL; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - return 0; -} - -static int paging32E_init_context(struct kvm_vcpu *vcpu) -{ - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); -} - -static int init_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - if (!is_paging(vcpu)) - return nonpaging_init_context(vcpu); - else if (is_long_mode(vcpu)) - return paging64_init_context(vcpu); - else if (is_pae(vcpu)) - return paging32E_init_context(vcpu); - else - return paging32_init_context(vcpu); -} - -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { - vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } -} - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); - -int kvm_mmu_load(struct kvm_vcpu *vcpu) -{ - int r; - - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - mmu_alloc_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); - kvm_mmu_flush_tlb(vcpu); -out: - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_load); - -void kvm_mmu_unload(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte) -{ - u64 pte; - struct kvm_mmu_page *child; - - pte = *spte; - if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL) - rmap_remove(vcpu->kvm, spte); - else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, spte); - } - } - set_shadow_pte(spte, shadow_trap_nonpresent_pte); -} - -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte, - const void *new, int bytes, - int offset_in_pte) -{ - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } - - ++vcpu->kvm->stat.mmu_pte_updated; - if (sp->role.glevels == PT32_ROOT_LEVEL) - paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); - else - paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); -} - -static bool need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) - return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; - return (old & ~new & PT64_PERM_MASK) != 0; -} - -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) -{ - if (need_remote_flush(old, new)) - kvm_flush_remote_tlbs(vcpu->kvm); - else - kvm_mmu_flush_tlb(vcpu); -} - -static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) -{ - u64 *spte = 
vcpu->arch.last_pte_updated; - - return !!(spte && (*spte & PT_ACCESSED_MASK)); -} - -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) -{ - gfn_t gfn; - int r; - u64 gpte = 0; - - if (bytes != 4 && bytes != 8) - return; - - /* - * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). - */ - if (is_pae(vcpu)) { - /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - if ((bytes == 4) && (gpa % 4 == 0)) { - r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); - if (r) - return; - memcpy((void *)&gpte + (gpa % 8), new, 4); - } else if ((bytes == 8) && (gpa % 8 == 0)) { - memcpy((void *)&gpte, new, 8); - } - } else { - if ((bytes == 4) && (gpa % 4 == 0)) - memcpy((void *)&gpte, new, 4); - } - if (!is_present_pte(gpte)) - return; - gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - vcpu->arch.update_pte.gfn = gfn; - vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); -} - -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - struct kvm_mmu_page *sp; - struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; - u64 entry; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned quadrant; - int level; - int flooded = 0; - int npte; - - pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); - mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) - continue; - pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; - misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); - misaligned |= bytes < 4; - if (misaligned || flooded) { - /* - * Misaligned accesses are too much trouble to fix - * up; also, they usually indicate a page is not used - * as a page table. - * - * If we're seeing too many writes to a page, - * it may no longer be a page table, or we may be - * forking, in which case it is better to unmap the - * page. - */ - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); - ++vcpu->kvm->stat.mmu_flooded; - continue; - } - page_offset = offset; - level = sp->role.level; - npte = 1; - if (sp->role.glevels == PT32_ROOT_LEVEL) { - page_offset <<= 1; /* 32->64 */ - /* - * A 32-bit pde maps 4MB while the shadow pdes map - * only 2MB. So we need to double the offset again - * and zap two pdes instead of one. 
- */ - if (level == PT32_ROOT_LEVEL) { - page_offset &= ~7; /* kill rounding error */ - page_offset <<= 1; - npte = 2; - } - quadrant = page_offset >> PAGE_SHIFT; - page_offset &= ~PAGE_MASK; - if (quadrant != sp->role.quadrant) - continue; - } - spte = &sp->spt[page_offset / sizeof(*spte)]; - while (npte--) { - entry = *spte; - mmu_pte_write_zap_pte(vcpu, sp, spte); - mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, - page_offset & (pte_size - 1)); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); - ++spte; - } - } - kvm_mmu_audit(vcpu, "post pte write"); - spin_unlock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.update_pte.page) { - kvm_release_page_clean(vcpu->arch.update_pte.page); - vcpu->arch.update_pte.page = NULL; - } -} - -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - down_read(&current->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - up_read(&current->mm->mmap_sem); - - spin_lock(&vcpu->kvm->mmu_lock); - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - spin_unlock(&vcpu->kvm->mmu_lock); - return r; -} - -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { - struct kvm_mmu_page *sp; - - sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - ++vcpu->kvm->stat.mmu_recycled; - } -} - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) -{ - int r; - enum emulation_result er; - - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); - if (r < 0) - goto out; - - if (!r) { - r = 1; - goto out; - } - - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - - er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_DO_MMIO: - ++vcpu->stat.mmio_exits; - return 0; - case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - return 1; - default: - BUG(); - } -out: - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); - -static void free_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - - while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); - } - free_page((unsigned long)vcpu->arch.mmu.pae_root); -} - -static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct page *page; - int i; - - ASSERT(vcpu); - - if (vcpu->kvm->arch.n_requested_mmu_pages) - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_requested_mmu_pages; - else - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_alloc_mmu_pages; - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone.
- */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) - goto error_1; - vcpu->arch.mmu.pae_root = page_address(page); - for (i = 0; i < 4; ++i) - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - - return 0; - -error_1: - free_mmu_pages(vcpu); - return -ENOMEM; -} - -int kvm_mmu_create(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - return alloc_mmu_pages(vcpu); -} - -int kvm_mmu_setup(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - return init_kvm_mmu(vcpu); -} - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { - int i; - u64 *pt; - - if (!test_bit(slot, &sp->slot_bitmap)) - continue; - - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) - pt[i] &= ~PT_WRITABLE_MASK; - } -} - -void kvm_mmu_zap_all(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - - spin_lock(&kvm->mmu_lock); - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); - spin_unlock(&kvm->mmu_lock); - - kvm_flush_remote_tlbs(kvm); -} - -void kvm_mmu_module_exit(void) -{ - if (pte_chain_cache) - kmem_cache_destroy(pte_chain_cache); - if (rmap_desc_cache) - kmem_cache_destroy(rmap_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); -} - -int kvm_mmu_module_init(void) -{ - pte_chain_cache = kmem_cache_create("kvm_pte_chain", - sizeof(struct kvm_pte_chain), - 0, 0, NULL); - if (!pte_chain_cache) - goto nomem; - rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", - sizeof(struct kvm_rmap_desc), - 0, 0, NULL); - if (!rmap_desc_cache) - goto nomem; - - mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); - if (!mmu_page_header_cache) - goto nomem; - - return 0; - -nomem: - kvm_mmu_module_exit(); - return -ENOMEM; -} - -/* - * Caculate mmu pages needed for kvm. 
- */ -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) -{ - int i; - unsigned int nr_mmu_pages; - unsigned int nr_pages = 0; - - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; - - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); - - return nr_mmu_pages; -} - -#ifdef AUDIT - -static const char *audit_msg; - -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; -#endif - return gva; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; - - if (ent == shadow_trap_nonpresent_pte) - continue; - - va = canonicalize(va); - if (level > 1) { - if (ent == shadow_notrap_nonpresent_pte) - printk(KERN_ERR "audit: (%s) nontrapping pte" - " in nonleaf level: levels %d gva %lx" - " level %d pte %llx\n", audit_msg, - vcpu->arch.mmu.root_level, va, level, ent); - - audit_mappings_page(vcpu, ent, va, level - 1); - } else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - struct page *page = gpa_to_page(vcpu, gpa); - hpa_t hpa = page_to_phys(page); - - if (is_shadow_present_pte(ent) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx %d\n", - audit_msg, vcpu->arch.mmu.root_level, - va, gpa, hpa, ent, - is_shadow_present_pte(ent)); - else if (ent == shadow_notrap_nonpresent_pte - && !is_error_hpa(hpa)) - printk(KERN_ERR "audit: (%s) notrap shadow," - " valid guest gva %lx\n", audit_msg, va); - kvm_release_page_clean(page); - - } - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); -} - -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - int i, j, k; - - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; - struct kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - unsigned long *rmapp = &m->rmap[j]; - - if (!*rmapp) - continue; - if (!(*rmapp & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - return nmaps; -} - -static int count_writable_mappings(struct kvm_vcpu *vcpu) -{ - int nmaps = 0; - struct kvm_mmu_page *sp; - int i; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!(ent & PT_WRITABLE_MASK)) - continue; - ++nmaps; - } - } - return nmaps; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __FUNCTION__, audit_msg, n_rmap, n_actual); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - struct 
kvm_memory_slot *slot; - unsigned long *rmapp; - gfn_t gfn; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.metaphysical) - continue; - - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[gfn - slot->base_gfn]; - if (*rmapp) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", - __FUNCTION__, audit_msg, sp->gfn, - sp->role.word); - } -} - -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) -{ - int olddbg = dbg; - - dbg = 0; - audit_msg = msg; - audit_rmap(vcpu); - audit_write_protection(vcpu); - audit_mappings(vcpu); - dbg = olddbg; -} - -#endif diff --git a/trunk/arch/x86/kvm/mmu.h b/trunk/arch/x86/kvm/mmu.h deleted file mode 100644 index 1fce19ec7a23..000000000000 --- a/trunk/arch/x86/kvm/mmu.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef __KVM_X86_MMU_H -#define __KVM_X86_MMU_H - -#include - -static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - __kvm_mmu_free_some_pages(vcpu); -} - -static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) -{ - if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) - return 0; - - return kvm_mmu_load(vcpu); -} - -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.shadow_efer & EFER_LME; -#else - return 0; -#endif -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PAE; -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr4 & X86_CR4_PSE; -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.cr0 & X86_CR0_PG; -} - -#endif diff --git a/trunk/arch/x86/kvm/paging_tmpl.h b/trunk/arch/x86/kvm/paging_tmpl.h deleted file mode 100644 index 03ba8608fe0f..000000000000 --- a/trunk/arch/x86/kvm/paging_tmpl.h +++ /dev/null @@ -1,484 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Yaniv Kamay - * Avi Kivity - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -/* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. 
- */ - -#if PTTYPE == 64 - #define pt_element_t u64 - #define guest_walker guest_walker64 - #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS - #ifdef CONFIG_X86_64 - #define PT_MAX_FULL_LEVELS 4 - #define CMPXCHG cmpxchg - #else - #define CMPXCHG cmpxchg64 - #define PT_MAX_FULL_LEVELS 2 - #endif -#elif PTTYPE == 32 - #define pt_element_t u32 - #define guest_walker guest_walker32 - #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS - #define PT_MAX_FULL_LEVELS 2 - #define CMPXCHG cmpxchg -#else - #error Invalid PTTYPE value -#endif - -#define gpte_to_gfn FNAME(gpte_to_gfn) -#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) - -/* - * The guest_walker structure emulates the behavior of the hardware page - * table walker. - */ -struct guest_walker { - int level; - gfn_t table_gfn[PT_MAX_FULL_LEVELS]; - pt_element_t ptes[PT_MAX_FULL_LEVELS]; - gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; - gfn_t gfn; - u32 error_code; -}; - -static gfn_t gpte_to_gfn(pt_element_t gpte) -{ - return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static gfn_t gpte_to_gfn_pde(pt_element_t gpte) -{ - return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, - gfn_t table_gfn, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) -{ - pt_element_t ret; - pt_element_t *table; - struct page *page; - - page = gfn_to_page(kvm, table_gfn); - table = kmap_atomic(page, KM_USER0); - - ret = CMPXCHG(&table[index], orig_pte, new_pte); - - kunmap_atomic(table, KM_USER0); - - kvm_release_page_dirty(page); - - return (ret != orig_pte); -} - -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; -#if PTTYPE == 64 - if (is_nx(vcpu)) - access &= ~(gpte >> PT64_NX_SHIFT); -#endif - return access; -} - -/* - * Fetch a guest pte for a guest virtual address - */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) -{ - pt_element_t pte; - gfn_t table_gfn; - unsigned index, pt_access, pte_access; - gpa_t pte_gpa; - - pgprintk("%s: addr %lx\n", __FUNCTION__, addr); -walk: - walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.cr3; -#if PTTYPE == 64 - if (!is_long_mode(vcpu)) { - pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; - if (!is_present_pte(pte)) - goto not_present; - --walker->level; - } -#endif - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); - - pt_access = ACC_ALL; - - for (;;) { - index = PT_INDEX(addr, walker->level); - - table_gfn = gpte_to_gfn(pte); - pte_gpa = gfn_to_gpa(table_gfn); - pte_gpa += index * sizeof(pt_element_t); - walker->table_gfn[walker->level - 1] = table_gfn; - walker->pte_gpa[walker->level - 1] = pte_gpa; - pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, - 
walker->level - 1, table_gfn); - - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); - - if (!is_present_pte(pte)) - goto not_present; - - if (write_fault && !is_writeble_pte(pte)) - if (user_fault || is_write_protection(vcpu)) - goto access_error; - - if (user_fault && !(pte & PT_USER_MASK)) - goto access_error; - -#if PTTYPE == 64 - if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) - goto access_error; -#endif - - if (!(pte & PT_ACCESSED_MASK)) { - mark_page_dirty(vcpu->kvm, table_gfn); - if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, - index, pte, pte|PT_ACCESSED_MASK)) - goto walk; - pte |= PT_ACCESSED_MASK; - } - - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); - - walker->ptes[walker->level - 1] = pte; - - if (walker->level == PT_PAGE_TABLE_LEVEL) { - walker->gfn = gpte_to_gfn(pte); - break; - } - - if (walker->level == PT_DIRECTORY_LEVEL - && (pte & PT_PAGE_SIZE_MASK) - && (PTTYPE == 64 || is_pse(vcpu))) { - walker->gfn = gpte_to_gfn_pde(pte); - walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); - if (PTTYPE == 32 && is_cpuid_PSE36()) - walker->gfn += pse36_gfn_delta(pte); - break; - } - - pt_access = pte_access; - --walker->level; - } - - if (write_fault && !is_dirty_pte(pte)) { - bool ret; - - mark_page_dirty(vcpu->kvm, table_gfn); - ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, - pte|PT_DIRTY_MASK); - if (ret) - goto walk; - pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); - walker->ptes[walker->level - 1] = pte; - } - - walker->pt_access = pt_access; - walker->pte_access = pte_access; - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __FUNCTION__, (u64)pte, pt_access, pte_access); - return 1; - -not_present: - walker->error_code = 0; - goto err; - -access_error: - walker->error_code = PFERR_PRESENT_MASK; - -err: - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) - walker->error_code |= PFERR_FETCH_MASK; - return 0; -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, - u64 *spte, const void *pte, int bytes, - int offset_in_pte) -{ - pt_element_t gpte; - unsigned pte_access; - struct page *npage; - - gpte = *(const pt_element_t *)pte; - if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!offset_in_pte && !is_present_pte(gpte)) - set_shadow_pte(spte, shadow_notrap_nonpresent_pte); - return; - } - if (bytes < sizeof(pt_element_t)) - return; - pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); - if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) - return; - npage = vcpu->arch.update_pte.page; - if (!npage) - return; - get_page(npage); - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); -} - -/* - * Fetch a shadow pte for a specific level in the paging hierarchy. 
- */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int *ptwrite, - struct page *page) -{ - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; - - if (!is_present_pte(walker->ptes[walker->level - 1])) - return NULL; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; - } - - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - bool new_page = 0; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (level == PT_PAGE_TABLE_LEVEL) - break; - if (is_shadow_present_pte(*shadow_ent)) { - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - continue; - } - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent, &new_page); - if (new_page && !metaphysical) { - int r; - pt_element_t curr_pte; - r = kvm_read_guest_atomic(vcpu->kvm, - walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) { - kvm_release_page_clean(page); - return NULL; - } - } - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *shadow_ent = shadow_pte; - } - - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, walker->gfn, page); - - return shadow_ent; -} - -/* - * Page fault handler. There are several causes for a page fault: - * - there is no shadow pte for the guest pte - * - write access through a shadow pte marked read only so that we can set - * the dirty bit - * - write access to a shadow pte marked read only so we can update the page - * dirty bitmap, when userspace requests it - * - mmio access; in this case we will never install a present shadow pte - * - normal guest page fault due to the guest pte marked not present, not - * writable, or not executable - * - * Returns: 1 if we need to emulate the instruction, 0 otherwise, or - * a negative value on error. - */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, - u32 error_code) -{ - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; - struct guest_walker walker; - u64 *shadow_pte; - int write_pt = 0; - int r; - struct page *page; - - pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - down_read(&current->mm->mmap_sem); - /* - * Look up the shadow pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); - - /* - * The page is not mapped by the guest. Let the guest handle it.
- */ - if (!r) { - pgprintk("%s: guest page fault\n", __FUNCTION__); - inject_page_fault(vcpu, addr, walker.error_code); - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - up_read(&current->mm->mmap_sem); - return 0; - } - - page = gfn_to_page(vcpu->kvm, walker.gfn); - - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - &write_pt, page); - pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, - shadow_pte, *shadow_pte, write_pt); - - if (!write_pt) - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - - /* - * mmio: emulate if accessible, otherwise its a guest fault. - */ - if (shadow_pte && is_io_pte(*shadow_pte)) { - spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&current->mm->mmap_sem); - return 1; - } - - ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, "post page fault (fixed)"); - spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&current->mm->mmap_sem); - - return write_pt; -} - -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; - } - - return gpa; -} - -static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i, offset = 0, r = 0; - pt_element_t pt; - - if (sp->role.metaphysical - || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { - nonpaging_prefetch_page(vcpu, sp); - return; - } - - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - gpa_t pte_gpa = gfn_to_gpa(sp->gfn); - pte_gpa += (i+offset) * sizeof(pt_element_t); - - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, - sizeof(pt_element_t)); - if (r || is_present_pte(pt)) - sp->spt[i] = shadow_trap_nonpresent_pte; - else - sp->spt[i] = shadow_notrap_nonpresent_pte; - } -} - -#undef pt_element_t -#undef guest_walker -#undef FNAME -#undef PT_BASE_ADDR_MASK -#undef PT_INDEX -#undef SHADOW_PT_INDEX -#undef PT_LEVEL_MASK -#undef PT_DIR_BASE_ADDR_MASK -#undef PT_LEVEL_BITS -#undef PT_MAX_FULL_LEVELS -#undef gpte_to_gfn -#undef gpte_to_gfn_pde -#undef CMPXCHG diff --git a/trunk/arch/x86/kvm/x86_emulate.c b/trunk/arch/x86/kvm/x86_emulate.c deleted file mode 100644 index 79586003397a..000000000000 --- a/trunk/arch/x86/kvm/x86_emulate.c +++ /dev/null @@ -1,1912 +0,0 @@ -/****************************************************************************** - * x86_emulate.c - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privileged instructions: - * - * Copyright (C) 2006 Qumranet - * - * Avi Kivity - * Yaniv Kamay - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef __KERNEL__ -#include -#include -#include -#define DPRINTF(_f, _a ...) printf(_f , ## _a) -#else -#include -#define DPRINTF(x...) do {} while (0) -#endif -#include -#include - -/* - * Opcode effective-address decode tables. - * Note that we only emulate instructions that have at least one memory - * operand (excluding implicit stack references).
We assume that stack - * references and instruction fetches will never occur in special memory - * areas that require emulation. So, for example, 'mov ,' need - * not be handled. - */ - -/* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstMask (3<<1) -/* Source operand type. */ -#define SrcNone (0<<3) /* No source operand. */ -#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<3) /* Register operand. */ -#define SrcMem (2<<3) /* Memory operand. */ -#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ -#define SrcImm (5<<3) /* Immediate operand. */ -#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ -#define SrcMask (7<<3) -/* Generic ModRM decode. */ -#define ModRM (1<<6) -/* Destination is only written; never read. */ -#define Mov (1<<7) -#define BitOp (1<<8) -#define MemAbs (1<<9) /* Memory operand is absolute displacement */ -#define String (1<<10) /* String instruction (rep capable) */ -#define Stack (1<<11) /* Stack instruction (push/pop) */ - -static u16 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - /* 0x60 - 0x67 */ - 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - 0, 0, ImplicitOps | Mov | Stack, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, 
ImplicitOps, ImplicitOps, - /* 0x78 - 0x7F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, - 0, 0, 0, 0, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps, ImplicitOps, - ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - /* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM -}; - -static u16 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, 
ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* EFLAGS bit definitions. */ -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* - * These EFLAGS bits are restored from saved value during emulation, and - * any changes are written back to the saved value after emulation. - */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) - -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - -/* Raw emulation: instruction has two explicit operands. 
*/ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"w %"_wx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _wy ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"l %"_lx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _ly ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_2op_8byte(_op, _src, _dst, \ - _eflags, _qx, _qy); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"b %"_bx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _by ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"b %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"w %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"l %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_1op_8byte(_op, _dst, _eflags); \ - break; \ - } \ - } while (0) - -/* Emulate an instruction with quadword operands (x86/64 only). 
*/ -#if defined(CONFIG_X86_64) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"q %"_qx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : _qy ((_src).val), "i" (EFLAGS_MASK)); \ - } while (0) - -#define __emulate_1op_8byte(_op, _dst, _eflags) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"q %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -#elif defined(__i386__) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) -#define __emulate_1op_8byte(_op, _dst, _eflags) -#endif /* __i386__ */ - -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != 0) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -/* Access/update address held in a register, based on addressing mode. */ -#define address_mask(reg) \ - ((c->ad_bytes == sizeof(unsigned long)) ? \ - (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) -#define register_address(base, reg) \ - ((base) + address_mask(reg)) -#define register_address_increment(reg, inc) \ - do { \ - /* signed type ensures sign extension to long */ \ - int _inc = (inc); \ - if (c->ad_bytes == sizeof(unsigned long)) \ - (reg) += _inc; \ - else \ - (reg) = ((reg) & \ - ~((1UL << (c->ad_bytes << 3)) - 1)) | \ - (((reg) + _inc) & \ - ((1UL << (c->ad_bytes << 3)) - 1)); \ - } while (0) - -#define JMP_REL(rel) \ - do { \ - register_address_increment(c->eip, rel); \ - } while (0) - -static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long linear, u8 *dest) -{ - struct fetch_cache *fc = &ctxt->decode.fetch; - int rc; - int size; - - if (linear < fc->start || linear >= fc->end) { - size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); - if (rc) - return rc; - fc->start = linear; - fc->end = linear + size; - } - *dest = fc->data[linear - fc->start]; - return 0; -} - -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long eip, void *dest, unsigned size) -{ - int rc = 0; - - eip += ctxt->cs_base; - while (size--) { - rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) - return rc; - } - return 0; -} - -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
- */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) -{ - void *p; - - p = &regs[modrm_reg]; - if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)&regs[modrm_reg & 3] + 1; - return p; -} - -static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *ptr, - u16 *size, unsigned long *address, int op_bytes) -{ - int rc; - - if (op_bytes == 2) - op_bytes = 3; - *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) - return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); - return rc; -} - -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); -} - -static void decode_register_operand(struct operand *op, - struct decode_cache *c, - int inhibit_bytereg) -{ - unsigned reg = c->modrm_reg; - int highbyte_regs = c->rex_prefix == 0; - - if (!(c->d & ModRM)) - reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); - op->type = OP_REG; - if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; - op->bytes = 1; - } else { - op->ptr = decode_register(reg, c->regs, 0); - op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->ptr; - break; - case 4: - op->val = *(u32 *)op->ptr; - break; - case 8: - op->val = *(u64 *) op->ptr; - break; - } - } - op->orig_val = op->val; -} - -static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - u8 sib; - int index_reg = 0, base_reg = 0, scale, rip_relative = 0; - int rc = 0; - - if (c->rex_prefix) { - c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ - c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ - } - - c->modrm = insn_fetch(u8, 1, c->eip); - c->modrm_mod |= (c->modrm & 0xc0) >> 6; - c->modrm_reg |= (c->modrm & 0x38) >> 3; - c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; - c->use_modrm_ea = 1; - - if (c->modrm_mod == 3) { - c->modrm_val = *(unsigned long *) - decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - return rc; - } - - if (c->ad_bytes == 2) { - unsigned bx = c->regs[VCPU_REGS_RBX]; - unsigned bp = c->regs[VCPU_REGS_RBP]; - unsigned si = c->regs[VCPU_REGS_RSI]; - unsigned di = c->regs[VCPU_REGS_RDI]; - - /* 16-bit ModR/M decode.
*/ - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - } - switch (c->modrm_rm) { - case 0: - c->modrm_ea += bx + si; - break; - case 1: - c->modrm_ea += bx + di; - break; - case 2: - c->modrm_ea += bp + si; - break; - case 3: - c->modrm_ea += bp + di; - break; - case 4: - c->modrm_ea += si; - break; - case 5: - c->modrm_ea += di; - break; - case 6: - if (c->modrm_mod != 0) - c->modrm_ea += bp; - break; - case 7: - c->modrm_ea += bx; - break; - } - if (c->modrm_rm == 2 || c->modrm_rm == 3 || - (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->override_base) - c->override_base = &ctxt->ss_base; - c->modrm_ea = (u16)c->modrm_ea; - } else { - /* 32/64-bit ModR/M decode. */ - switch (c->modrm_rm) { - case 4: - case 12: - sib = insn_fetch(u8, 1, c->eip); - index_reg |= (sib >> 3) & 7; - base_reg |= sib & 7; - scale = sib >> 6; - - switch (base_reg) { - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[base_reg]; - else - c->modrm_ea += - insn_fetch(s32, 4, c->eip); - break; - default: - c->modrm_ea += c->regs[base_reg]; - } - switch (index_reg) { - case 4: - break; - default: - c->modrm_ea += c->regs[index_reg] << scale; - } - break; - case 5: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[c->modrm_rm]; - else if (ctxt->mode == X86EMUL_MODE_PROT64) - rip_relative = 1; - break; - default: - c->modrm_ea += c->regs[c->modrm_rm]; - break; - } - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - } - } - if (rip_relative) { - c->modrm_ea += c->eip; - switch (c->d & SrcMask) { - case SrcImmByte: - c->modrm_ea += 1; - break; - case SrcImm: - if (c->d & ByteOp) - c->modrm_ea += 1; - else - if (c->op_bytes == 8) - c->modrm_ea += 4; - else - c->modrm_ea += c->op_bytes; - } - } -done: - return rc; -} - -static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->ad_bytes) { - case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); - break; - case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); - break; - case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); - break; - } -done: - return rc; -} - -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes; - - /* Shadow copy of register state. Committed on successful emulation. */ - - memset(c, 0, sizeof(struct decode_cache)); - c->eip = ctxt->vcpu->arch.rip; - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. 
*/ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x2e: /* CS override */ - c->override_base = &ctxt->cs_base; - break; - case 0x3e: /* DS override */ - c->override_base = &ctxt->ds_base; - break; - case 0x26: /* ES override */ - c->override_base = &ctxt->es_base; - break; - case 0x64: /* FS override */ - c->override_base = &ctxt->fs_base; - break; - case 0x65: /* GS override */ - c->override_base = &ctxt->gs_base; - break; - case 0x36: /* SS override */ - c->override_base = &ctxt->ss_base; - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - c->d = opcode_table[c->b]; - if (c->d == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; - } - - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } - } - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc) - goto done; - - if (!c->override_base) - c->override_base = &ctxt->ds_base; - if (mode == X86EMUL_MODE_PROT64 && - c->override_base != &ctxt->fs_base && - c->override_base != &ctxt->gs_base) - c->override_base = NULL; - - if (c->override_base) - c->modrm_ea += *c->override_base; - - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - break; - } - c->src.type = OP_MEM; - break; - case SrcImm: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. 
*/ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - break; - case SrcImmByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - c->src.val = insn_fetch(s8, 1, c->eip); - break; - } - - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.type = OP_REG; - break; - } - c->dst.type = OP_MEM; - break; - } - -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -} - -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]); -} - -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = ops->read_std(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), - &c->dst.val, c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - - register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); - - return 0; -} - -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - switch (c->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); - break; - case 1: /* ror */ - emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); - break; - case 2: /* rcl */ - emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); - break; - case 3: /* rcr */ - emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); - break; - case 5: /* shr */ - emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); - break; - case 7: /* sar */ - emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); - break; - } -} - -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->modrm_reg) { - case 0 ... 1: /* test */ - /* - * Special case in Grp3: test has an immediate - * source operand. - */ - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 2: /* not */ - c->dst.val = ~c->dst.val; - break; - case 3: /* neg */ - emulate_1op("neg", c->dst, ctxt->eflags); - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - rc = X86EMUL_UNHANDLEABLE; - break; - } -done: - return rc; -} - -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - switch (c->modrm_reg) { - case 0: /* inc */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 1: /* dec */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 4: /* jmp abs */ - if (c->b == 0xff) - c->eip = c->dst.val; - else { - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } - break; - case 6: /* push */ - - /* 64-bit mode: PUSH always pushes a 64-bit operand. */ - - if (ctxt->mode == X86EMUL_MODE_PROT64) { - c->dst.bytes = 8; - rc = ops->read_std((unsigned long)c->dst.ptr, - &c->dst.val, 8, ctxt->vcpu); - if (rc != 0) - return rc; - } - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->dst.bytes); - rc = ops->write_emulated(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), &c->dst.val, - c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - c->dst.type = OP_NONE; - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } - return 0; -} - -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long memop) -{ - struct decode_cache *c = &ctxt->decode; - u64 old, new; - int rc; - - rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) - return rc; - - if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); - ctxt->eflags &= ~EFLG_ZF; - - } else { - new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; - - rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) - return rc; - ctxt->eflags |= EFLG_ZF; - } - return 0; -} - -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. 
- */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != 0) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return 0; -} - -int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - unsigned long memop = 0; - u64 msr_data; - unsigned long saved_eip = 0; - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - saved_eip = c->eip; - - if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) - memop = c->modrm_ea; - - if (c->rep_prefix && (c->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (c->regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - ctxt->vcpu->arch.rip = c->eip; - goto done; - } - } - c->regs[VCPU_REGS_RCX]--; - c->eip = ctxt->vcpu->arch.rip; - } - - if (c->src.type == OP_MEM) { - c->src.ptr = (unsigned long *)memop; - c->src.val = 0; - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); - if (rc != 0) - goto done; - c->src.orig_val = c->src.val; - } - - if ((c->d & DstMask) == ImplicitOps) - goto special_insn; - - - if (c->dst.type == OP_MEM) { - c->dst.ptr = (unsigned long *)memop; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; - } - c->dst.orig_val = c->dst.val; - -special_insn: - - if (c->twobyte) - goto twobyte_insn; - - switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); - break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; - case 0x20 ... 
0x23: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x24: /* and al imm8 */ - c->dst.type = OP_REG; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - c->dst.val = *(u8 *)c->dst.ptr; - c->dst.bytes = 1; - c->dst.orig_val = c->dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - if (c->op_bytes == 2) - c->dst.val = *(u16 *)c->dst.ptr; - else - c->dst.val = *(u32 *)c->dst.ptr; - c->dst.orig_val = c->dst.val; - goto and; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 0x50 ... 0x57: /* push reg */ - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->op_bytes); - c->dst.ptr = (void *) register_address( - ctxt->ss_base, c->regs[VCPU_REGS_RSP]); - break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - if ((rc = ops->read_std(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), c->dst.ptr, - c->op_bytes, ctxt->vcpu)) != 0) - goto done; - - register_address_increment(c->regs[VCPU_REGS_RSP], - c->op_bytes); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - c->dst.val = (s32) c->src.val; - break; - case 0x6a: /* push imm8 */ - c->src.val = 0L; - c->src.val = insn_fetch(s8, 1, c->eip); - emulate_push(ctxt); - break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 1, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(ctxt->es_base, - c->regs[VCPU_REGS_RDI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 0, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c->override_base ? - *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x70 ... 0x7f: /* jcc (short) */ { - int rel = insn_fetch(s8, 1, c->eip); - - if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); - break; - } - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; - case 0x84 ... 0x85: - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 0x86 ... 0x87: /* xchg */ - /* Write back the register source. 
*/ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; - break; - case 4: - *c->src.ptr = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.ptr = c->dst.val; - break; - } - /* - * Write back the memory destination with implicit LOCK - * prefix. - */ - c->dst.val = c->src.val; - c->lock_prefix = 1; - break; - case 0x88 ... 0x8b: /* mov */ - goto mov; - case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_val; - break; - case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); - break; - case 0x9d: /* popf */ - c->dst.ptr = (unsigned long *) &ctxt->eflags; - goto pop_instruction; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; - case 0xa4 ... 0xa5: /* movs */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address( - c->override_base ? *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->src.type = OP_NONE; /* Disable writeback. */ - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address( - c->override_base ? *c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) - goto done; - - c->dst.type = OP_NONE; /* Disable writeback. */ - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->src.bytes - : c->src.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - - break; - case 0xaa ... 0xab: /* stos */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( - ctxt->es_base, - c->regs[VCPU_REGS_RDI]); - c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xac ... 0xad: /* lods */ - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address( - c->override_base ? 
*c->override_base : - ctxt->ds_base, - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xc0 ... 0xc1: - emulate_grp2(ctxt); - break; - case 0xc3: /* ret */ - c->dst.ptr = &c->eip; - goto pop_instruction; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; - break; - case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; - emulate_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); - break; - case 0xe8: /* call (near) */ { - long int rel; - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - default: - DPRINTF("Call: Invalid op_bytes\n"); - goto cannot_emulate; - } - c->src.val = (unsigned long) c->eip; - JMP_REL(rel); - c->op_bytes = c->ad_bytes; - emulate_push(ctxt); - break; - } - case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ - JMP_REL(c->src.val); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; - goto done; - case 0xf5: /* cmc */ - /* complement carry flag from eflags reg */ - ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0xf8: /* clc */ - ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfb: /* sti */ - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ - rc = emulate_grp45(ctxt, ops); - if (rc != 0) - goto done; - break; - } - -writeback: - rc = writeback(ctxt, ops); - if (rc != 0) - goto done; - - /* Commit shadow register state. 
*/ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - ctxt->vcpu->arch.rip = c->eip; - -done: - if (rc == X86EMUL_UNHANDLEABLE) { - c->eip = saved_eip; - return -1; - } - return 0; - -twobyte_insn: - switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - - kvm_emulate_hypercall(ctxt->vcpu); - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, c->op_bytes); - if (rc) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3 && c->modrm_rm == 1) { - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - kvm_emulate_hypercall(ctxt->vcpu); - } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, - c->op_bytes); - if (rc) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - break; - case 4: /* smsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - *(u16 *)&c->regs[c->modrm_rm] - = realmode_get_cr(ctxt->vcpu, 0); - break; - case 6: /* lmsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, - &ctxt->eflags); - break; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, memop); - break; - default: - goto cannot_emulate; - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 0x06: - emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 0x08: /* invd */ - case 0x09: /* wbinvd */ - case 0x0d: /* GrpP (prefetch) */ - case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; - break; - case 0x20: /* mov cr, reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - c->regs[c->modrm_rm] = - realmode_get_cr(ctxt->vcpu, c->modrm_reg); - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x21: /* mov from dr to reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x22: /* mov reg, cr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); - c->dst.type = OP_NONE; - break; - case 0x23: /* mov from reg to dr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x30: - /* wrmsr */ - msr_data = (u32)c->regs[VCPU_REGS_RAX] - | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x32: - /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; - } else { - c->regs[VCPU_REGS_RAX] = (u32)msr_data; - c->regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x40 ... 0x4f: /* cmov */ - c->dst.val = c->dst.orig_val = c->src.val; - if (!test_cc(c->b, ctxt->eflags)) - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x80 ... 
0x8f: /* jnz rel, etc*/ { - long int rel; - - switch (c->op_bytes) { - case 2: - rel = insn_fetch(s16, 2, c->eip); - break; - case 4: - rel = insn_fetch(s32, 4, c->eip); - break; - case 8: - rel = insn_fetch(s64, 8, c->eip); - break; - default: - DPRINTF("jnz: Invalid op_bytes\n"); - goto cannot_emulate; - } - if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); - c->dst.type = OP_NONE; - break; - } - case 0xa3: - bt: /* bt */ - c->dst.type = OP_NONE; - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); - break; - case 0xab: - bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); - break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - c->src.orig_val = c->src.val; - c->src.val = c->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - c->dst.val = c->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - } - break; - case 0xb3: - btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); - break; - case 0xb6 ... 0xb7: /* movzx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (u8) c->src.val - : (u16) c->src.val; - break; - case 0xba: /* Grp8 */ - switch (c->modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); - break; - case 0xbe ... 0xbf: /* movsx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : - (s16) c->src.val; - break; - case 0xc3: /* movnti */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : - (u64) c->src.val; - break; - case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) - goto done; - c->dst.type = OP_NONE; - break; - } - goto writeback; - -cannot_emulate: - DPRINTF("Cannot emulate %02x\n", c->b); - c->eip = saved_eip; - return -1; -} diff --git a/trunk/arch/x86/lguest/boot.c b/trunk/arch/x86/lguest/boot.c index 5afdde4895dc..a63373759f08 100644 --- a/trunk/arch/x86/lguest/boot.c +++ b/trunk/arch/x86/lguest/boot.c @@ -67,7 +67,6 @@ #include #include #include -#include /* for struct machine_ops */ /*G:010 Welcome to the Guest! * @@ -814,7 +813,7 @@ static void lguest_safe_halt(void) * rather than virtual addresses, so we use __pa() here. */ static void lguest_power_off(void) { - hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0); + hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); } /* @@ -824,7 +823,7 @@ static void lguest_power_off(void) */ static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) { - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0); + hcall(LHCALL_CRASH, __pa(p), 0, 0); /* The hcall won't return, but to keep gcc happy, we're "done". 
*/ return NOTIFY_DONE; } @@ -928,11 +927,6 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0); -} - /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops * structures in the kernel provide points for (almost) every routine we have * to override to avoid privileged instructions. */ @@ -1066,7 +1060,6 @@ __init void lguest_init(void) * the Guest routine to power off. */ pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; /* Now we're set up, call start_kernel() in init/main.c and we proceed * to boot as normal. It never returns. */ start_kernel(); diff --git a/trunk/block/bsg.c b/trunk/block/bsg.c index 8917c5174dc2..69b0a9d33306 100644 --- a/trunk/block/bsg.c +++ b/trunk/block/bsg.c @@ -279,7 +279,6 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr) goto out; } rq->next_rq = next_rq; - next_rq->cmd_type = rq->cmd_type; dxferp = (void*)(unsigned long)hdr->din_xferp; ret = blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len); diff --git a/trunk/drivers/Kconfig b/trunk/drivers/Kconfig index 08d4ae201597..f4076d9e9902 100644 --- a/trunk/drivers/Kconfig +++ b/trunk/drivers/Kconfig @@ -90,6 +90,8 @@ source "drivers/dca/Kconfig" source "drivers/auxdisplay/Kconfig" +source "drivers/kvm/Kconfig" + source "drivers/uio/Kconfig" source "drivers/virtio/Kconfig" diff --git a/trunk/drivers/Makefile b/trunk/drivers/Makefile index 0ee9a8a4095e..d92d4d82d001 100644 --- a/trunk/drivers/Makefile +++ b/trunk/drivers/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_SPI) += spi/ obj-$(CONFIG_PCCARD) += pcmcia/ obj-$(CONFIG_DIO) += dio/ obj-$(CONFIG_SBUS) += sbus/ +obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_ZORRO) += zorro/ obj-$(CONFIG_MAC) += macintosh/ obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ @@ -72,7 +73,7 @@ obj-$(CONFIG_ISDN) += isdn/ obj-$(CONFIG_EDAC) += edac/ obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ -obj-y += lguest/ +obj-$(CONFIG_LGUEST_GUEST) += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-$(CONFIG_MMC) += mmc/ diff --git a/trunk/drivers/base/bus.c b/trunk/drivers/base/bus.c index 055989e94799..f484495b2ad1 100644 --- a/trunk/drivers/base/bus.c +++ b/trunk/drivers/base/bus.c @@ -163,6 +163,15 @@ static struct kset *bus_kset; #ifdef CONFIG_HOTPLUG /* Manually detach a device from its associated driver. 
*/ +static int driver_helper(struct device *dev, void *data) +{ + const char *name = data; + + if (strcmp(name, dev->bus_id) == 0) + return 1; + return 0; +} + static ssize_t driver_unbind(struct device_driver *drv, const char *buf, size_t count) { @@ -170,7 +179,7 @@ static ssize_t driver_unbind(struct device_driver *drv, struct device *dev; int err = -ENODEV; - dev = bus_find_device_by_name(bus, NULL, buf); + dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); if (dev && dev->driver == drv) { if (dev->parent) /* Needed for USB */ down(&dev->parent->sem); @@ -197,7 +206,7 @@ static ssize_t driver_bind(struct device_driver *drv, struct device *dev; int err = -ENODEV; - dev = bus_find_device_by_name(bus, NULL, buf); + dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); if (dev && dev->driver == NULL) { if (dev->parent) /* Needed for USB */ down(&dev->parent->sem); @@ -241,7 +250,7 @@ static ssize_t store_drivers_probe(struct bus_type *bus, { struct device *dev; - dev = bus_find_device_by_name(bus, NULL, buf); + dev = bus_find_device(bus, NULL, (void *)buf, driver_helper); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) != 0) @@ -329,32 +338,6 @@ struct device *bus_find_device(struct bus_type *bus, } EXPORT_SYMBOL_GPL(bus_find_device); -static int match_name(struct device *dev, void *data) -{ - const char *name = data; - - if (strcmp(name, dev->bus_id) == 0) - return 1; - return 0; -} - -/** - * bus_find_device_by_name - device iterator for locating a particular device of a specific name - * @bus: bus type - * @start: Device to begin with - * @name: name of the device to match - * - * This is similar to the bus_find_device() function above, but it handles - * searching by a name automatically, no need to write another strcmp matching - * function. - */ -struct device *bus_find_device_by_name(struct bus_type *bus, - struct device *start, const char *name) -{ - return bus_find_device(bus, start, (void *)name, match_name); -} -EXPORT_SYMBOL_GPL(bus_find_device_by_name); - static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); diff --git a/trunk/drivers/base/class.c b/trunk/drivers/base/class.c index 9d915376c313..59cf35894cfc 100644 --- a/trunk/drivers/base/class.c +++ b/trunk/drivers/base/class.c @@ -149,7 +149,7 @@ int class_register(struct class *cls) if (error) return error; -#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) +#ifdef CONFIG_SYSFS_DEPRECATED /* let the block class directory show up in the root of sysfs */ if (cls != &block_class) cls->subsys.kobj.kset = class_kset; @@ -863,7 +863,7 @@ EXPORT_SYMBOL_GPL(class_for_each_device); * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. - * + * Note, you will need to drop the reference with put_device() after use. 
* * We hold class->sem in this function, so it can not be diff --git a/trunk/drivers/base/core.c b/trunk/drivers/base/core.c index b1727876182c..edf3bbeb8d6a 100644 --- a/trunk/drivers/base/core.c +++ b/trunk/drivers/base/core.c @@ -27,17 +27,9 @@ int (*platform_notify)(struct device *dev) = NULL; int (*platform_notify_remove)(struct device *dev) = NULL; -#ifdef CONFIG_BLOCK -static inline int device_is_not_partition(struct device *dev) -{ - return !(dev->type == &part_type); -} -#else -static inline int device_is_not_partition(struct device *dev) -{ - return 1; -} -#endif +/* + * sysfs bindings for devices. + */ /** * dev_driver_string - Return a device's driver name, if at all possible @@ -660,14 +652,14 @@ static int device_add_class_symlinks(struct device *dev) #ifdef CONFIG_SYSFS_DEPRECATED /* stacked class devices need a symlink in the class directory */ if (dev->kobj.parent != &dev->class->subsys.kobj && - device_is_not_partition(dev)) { + dev->type != &part_type) { error = sysfs_create_link(&dev->class->subsys.kobj, &dev->kobj, dev->bus_id); if (error) goto out_subsys; } - if (dev->parent && device_is_not_partition(dev)) { + if (dev->parent && dev->type != &part_type) { struct device *parent = dev->parent; char *class_name; @@ -696,11 +688,11 @@ static int device_add_class_symlinks(struct device *dev) return 0; out_device: - if (dev->parent && device_is_not_partition(dev)) + if (dev->parent && dev->type != &part_type) sysfs_remove_link(&dev->kobj, "device"); out_busid: if (dev->kobj.parent != &dev->class->subsys.kobj && - device_is_not_partition(dev)) + dev->type != &part_type) sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); #else /* link in the class directory pointing to the device */ @@ -709,7 +701,7 @@ static int device_add_class_symlinks(struct device *dev) if (error) goto out_subsys; - if (dev->parent && device_is_not_partition(dev)) { + if (dev->parent && dev->type != &part_type) { error = sysfs_create_link(&dev->kobj, &dev->parent->kobj, "device"); if (error) @@ -733,7 +725,7 @@ static void device_remove_class_symlinks(struct device *dev) return; #ifdef CONFIG_SYSFS_DEPRECATED - if (dev->parent && device_is_not_partition(dev)) { + if (dev->parent && dev->type != &part_type) { char *class_name; class_name = make_class_name(dev->class->name, &dev->kobj); @@ -745,10 +737,10 @@ static void device_remove_class_symlinks(struct device *dev) } if (dev->kobj.parent != &dev->class->subsys.kobj && - device_is_not_partition(dev)) + dev->type != &part_type) sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); #else - if (dev->parent && device_is_not_partition(dev)) + if (dev->parent && dev->type != &part_type) sysfs_remove_link(&dev->kobj, "device"); sysfs_remove_link(&dev->class->subsys.kobj, dev->bus_id); diff --git a/trunk/drivers/ieee1394/sbp2.c b/trunk/drivers/ieee1394/sbp2.c index 1eda11abeb1e..96eac0b53019 100644 --- a/trunk/drivers/ieee1394/sbp2.c +++ b/trunk/drivers/ieee1394/sbp2.c @@ -1489,7 +1489,7 @@ static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb, /* loop through and fill out our SBP-2 page tables * (and split up anything too large) */ - for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) { + for (i = 0, sg_count = 0; i < count; i++, sgpnt = sg_next(sgpnt)) { sg_len = sg_dma_len(sgpnt); sg_addr = sg_dma_address(sgpnt); while (sg_len) { diff --git a/trunk/drivers/infiniband/ulp/srp/ib_srp.c b/trunk/drivers/infiniband/ulp/srp/ib_srp.c index 195ce7c12319..f2d2c7e2c76b 100644 --- a/trunk/drivers/infiniband/ulp/srp/ib_srp.c +++ 
b/trunk/drivers/infiniband/ulp/srp/ib_srp.c @@ -1571,6 +1571,7 @@ static struct scsi_host_template srp_template = { .this_id = -1, .cmd_per_lun = SRP_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = srp_host_attrs }; diff --git a/trunk/arch/x86/kvm/Kconfig b/trunk/drivers/kvm/Kconfig similarity index 94% rename from trunk/arch/x86/kvm/Kconfig rename to trunk/drivers/kvm/Kconfig index c83e1c9b5129..656920636cb2 100644 --- a/trunk/arch/x86/kvm/Kconfig +++ b/trunk/drivers/kvm/Kconfig @@ -1,12 +1,9 @@ # # KVM configuration # -config HAVE_KVM - bool - menuconfig VIRTUALIZATION bool "Virtualization" - depends on HAVE_KVM || X86 + depends on X86 default y ---help--- Say Y here to get to see options for using your Linux host to run other @@ -19,7 +16,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM && EXPERIMENTAL + depends on X86 && EXPERIMENTAL select PREEMPT_NOTIFIERS select ANON_INODES ---help--- diff --git a/trunk/arch/x86/kvm/Makefile b/trunk/drivers/kvm/Makefile similarity index 51% rename from trunk/arch/x86/kvm/Makefile rename to trunk/drivers/kvm/Makefile index ffdd0b310784..e5a8f4d3e973 100644 --- a/trunk/arch/x86/kvm/Makefile +++ b/trunk/drivers/kvm/Makefile @@ -2,11 +2,7 @@ # Makefile for Kernel-based Virtual Machine module # -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) - -EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm - -kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o +kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/trunk/arch/x86/kvm/i8259.c b/trunk/drivers/kvm/i8259.c similarity index 98% rename from trunk/arch/x86/kvm/i8259.c rename to trunk/drivers/kvm/i8259.c index ab29cf2def47..a679157bc599 100644 --- a/trunk/arch/x86/kvm/i8259.c +++ b/trunk/drivers/kvm/i8259.c @@ -28,8 +28,6 @@ #include #include "irq.h" -#include - /* * set irq level. If an edge is detected, then the IRR is set to 1 */ @@ -183,8 +181,10 @@ int kvm_pic_read_irq(struct kvm_pic *s) return intno; } -void kvm_pic_reset(struct kvm_kpic_state *s) +static void pic_reset(void *opaque) { + struct kvm_kpic_state *s = opaque; + s->last_irr = 0; s->irr = 0; s->imr = 0; @@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - kvm_pic_reset(s); /* init */ + pic_reset(s); /* init */ /* * deassert a pending interrupt */ diff --git a/trunk/virt/kvm/ioapic.c b/trunk/drivers/kvm/ioapic.c similarity index 83% rename from trunk/virt/kvm/ioapic.c rename to trunk/drivers/kvm/ioapic.c index 317f8e211cd2..c7992e667fdb 100644 --- a/trunk/virt/kvm/ioapic.c +++ b/trunk/drivers/kvm/ioapic.c @@ -26,7 +26,7 @@ * Based on Xen 3.1 code. */ -#include +#include "kvm.h" #include #include #include @@ -34,17 +34,14 @@ #include #include #include +#include #include #include - -#include "ioapic.h" -#include "lapic.h" - -#if 0 -#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) -#else +#include +#include +#include "irq.h" +/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ #define ioapic_debug(fmt, arg...) 
-#endif static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, @@ -116,7 +113,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) default: index = (ioapic->ioregsel - 0x10) >> 1; - ioapic_debug("change redir index %x val %x\n", index, val); + ioapic_debug("change redir index %x val %x", index, val); if (index >= IOAPIC_NUM_PINS) return; if (ioapic->ioregsel & 1) { @@ -134,16 +131,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } static void ioapic_inj_irq(struct kvm_ioapic *ioapic, - struct kvm_vcpu *vcpu, + struct kvm_lapic *target, u8 vector, u8 trig_mode, u8 delivery_mode) { - ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, + ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, delivery_mode); - ASSERT((delivery_mode == IOAPIC_FIXED) || - (delivery_mode == IOAPIC_LOWEST_PRIORITY)); + ASSERT((delivery_mode == dest_Fixed) || + (delivery_mode == dest_LowestPrio)); - kvm_apic_set_irq(vcpu, vector, trig_mode); + kvm_apic_set_irq(target, vector, trig_mode); } static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, @@ -154,12 +151,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, struct kvm *kvm = ioapic->kvm; struct kvm_vcpu *vcpu; - ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); + ioapic_debug("dest %d dest_mode %d", dest, dest_mode); if (dest_mode == 0) { /* Physical mode. */ if (dest == 0xFF) { /* Broadcast. */ for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) + if (kvm->vcpus[i] && kvm->vcpus[i]->apic) mask |= 1 << i; return mask; } @@ -167,8 +164,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, vcpu = kvm->vcpus[i]; if (!vcpu) continue; - if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { - if (vcpu->arch.apic) + if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { + if (vcpu->apic) mask = 1 << i; break; } @@ -178,11 +175,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, vcpu = kvm->vcpus[i]; if (!vcpu) continue; - if (vcpu->arch.apic && - kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) + if (vcpu->apic && + kvm_apic_match_logical_addr(vcpu->apic, dest)) mask |= 1 << vcpu->vcpu_id; } - ioapic_debug("mask %x\n", mask); + ioapic_debug("mask %x", mask); return mask; } @@ -194,39 +191,41 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) u8 vector = ioapic->redirtbl[irq].fields.vector; u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; u32 deliver_bitmask; + struct kvm_lapic *target; struct kvm_vcpu *vcpu; int vcpu_id; ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " - "vector=%x trig_mode=%x\n", + "vector=%x trig_mode=%x", dest, dest_mode, delivery_mode, vector, trig_mode); deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); if (!deliver_bitmask) { - ioapic_debug("no target on destination\n"); + ioapic_debug("no target on destination"); return; } switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, - deliver_bitmask); - if (vcpu != NULL) - ioapic_inj_irq(ioapic, vcpu, vector, + case dest_LowestPrio: + target = + kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); + if (target != NULL) + ioapic_inj_irq(ioapic, target, vector, trig_mode, delivery_mode); else - ioapic_debug("null lowest prio vcpu: " - "mask=%x vector=%x delivery_mode=%x\n", - deliver_bitmask, vector, 
IOAPIC_LOWEST_PRIORITY); + ioapic_debug("null round robin: " + "mask=%x vector=%x delivery_mode=%x", + deliver_bitmask, vector, dest_LowestPrio); break; - case IOAPIC_FIXED: + case dest_Fixed: for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { if (!(deliver_bitmask & (1 << vcpu_id))) continue; deliver_bitmask &= ~(1 << vcpu_id); vcpu = ioapic->kvm->vcpus[vcpu_id]; if (vcpu) { - ioapic_inj_irq(ioapic, vcpu, vector, + target = vcpu->apic; + ioapic_inj_irq(ioapic, target, vector, trig_mode, delivery_mode); } } @@ -272,7 +271,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; + struct kvm_ioapic *ioapic = kvm->vioapic; union ioapic_redir_entry *ent; int gsi; @@ -305,7 +304,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; u32 result; - ioapic_debug("addr %lx\n", (unsigned long)addr); + ioapic_debug("addr %lx", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; @@ -342,8 +341,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; u32 data; - ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", - (void*)addr, len, val); + ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", + addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ if (len == 4 || len == 8) data = *(u32 *) val; @@ -361,38 +360,24 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, case IOAPIC_REG_WINDOW: ioapic_write_indirect(ioapic, data); break; -#ifdef CONFIG_IA64 - case IOAPIC_REG_EOI: - kvm_ioapic_update_eoi(ioapic->kvm, data); - break; -#endif default: break; } } -void kvm_ioapic_reset(struct kvm_ioapic *ioapic) -{ - int i; - - for (i = 0; i < IOAPIC_NUM_PINS; i++) - ioapic->redirtbl[i].fields.mask = 1; - ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; - ioapic->ioregsel = 0; - ioapic->irr = 0; - ioapic->id = 0; -} - int kvm_ioapic_init(struct kvm *kvm) { struct kvm_ioapic *ioapic; + int i; ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); if (!ioapic) return -ENOMEM; - kvm->arch.vioapic = ioapic; - kvm_ioapic_reset(ioapic); + kvm->vioapic = ioapic; + for (i = 0; i < IOAPIC_NUM_PINS; i++) + ioapic->redirtbl[i].fields.mask = 1; + ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; ioapic->dev.read = ioapic_mmio_read; ioapic->dev.write = ioapic_mmio_write; ioapic->dev.in_range = ioapic_in_range; diff --git a/trunk/arch/x86/kvm/irq.c b/trunk/drivers/kvm/irq.c similarity index 81% rename from trunk/arch/x86/kvm/irq.c rename to trunk/drivers/kvm/irq.c index e5714759e97f..7628c7ff628f 100644 --- a/trunk/arch/x86/kvm/irq.c +++ b/trunk/drivers/kvm/irq.c @@ -20,8 +20,8 @@ */ #include -#include +#include "kvm.h" #include "irq.h" /* @@ -63,6 +63,26 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) } EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); +static void vcpu_kick_intr(void *info) +{ +#ifdef DEBUG + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; + printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); +#endif +} + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int ipi_pcpu = vcpu->cpu; + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + ++vcpu->stat.halt_wakeup; + } + if (vcpu->guest_mode) + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); +} + void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { 
kvm_inject_apic_timer_irqs(vcpu); diff --git a/trunk/drivers/kvm/irq.h b/trunk/drivers/kvm/irq.h new file mode 100644 index 000000000000..11fc014e2b30 --- /dev/null +++ b/trunk/drivers/kvm/irq.h @@ -0,0 +1,165 @@ +/* + * irq.h: in kernel interrupt controller related definitions + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong + * + */ + +#ifndef __IRQ_H +#define __IRQ_H + +#include "kvm.h" + +typedef void irq_request_func(void *opaque, int level); + +struct kvm_kpic_state { + u8 last_irr; /* edge detection */ + u8 irr; /* interrupt request register */ + u8 imr; /* interrupt mask register */ + u8 isr; /* interrupt service register */ + u8 priority_add; /* highest irq priority */ + u8 irq_base; + u8 read_reg_select; + u8 poll; + u8 special_mask; + u8 init_state; + u8 auto_eoi; + u8 rotate_on_auto_eoi; + u8 special_fully_nested_mode; + u8 init4; /* true if 4 byte init */ + u8 elcr; /* PIIX edge/trigger selection */ + u8 elcr_mask; + struct kvm_pic *pics_state; +}; + +struct kvm_pic { + struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ + irq_request_func *irq_request; + void *irq_request_opaque; + int output; /* intr from master PIC */ + struct kvm_io_device dev; +}; + +struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_read_irq(struct kvm_pic *s); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +int kvm_cpu_has_interrupt(struct kvm_vcpu *v); +void kvm_pic_update_irq(struct kvm_pic *s); + +#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ +#define IOAPIC_EDGE_TRIG 0 +#define IOAPIC_LEVEL_TRIG 1 + +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 +#define IOAPIC_MEM_LENGTH 0x100 + +/* Direct registers. */ +#define IOAPIC_REG_SELECT 0x00 +#define IOAPIC_REG_WINDOW 0x10 +#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ + +/* Indirect registers. 
*/ +#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ +#define IOAPIC_REG_VERSION 0x01 +#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ + +struct kvm_ioapic { + u64 base_address; + u32 ioregsel; + u32 id; + u32 irr; + u32 pad; + union ioapic_redir_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; + } redirtbl[IOAPIC_NUM_PINS]; + struct kvm_io_device dev; + struct kvm *kvm; +}; + +struct kvm_lapic { + unsigned long base_address; + struct kvm_io_device dev; + struct { + atomic_t pending; + s64 period; /* unit: ns */ + u32 divide_count; + ktime_t last_update; + struct hrtimer dev; + } timer; + struct kvm_vcpu *vcpu; + struct page *regs_page; + void *regs; +}; + +#ifdef DEBUG +#define ASSERT(x) \ +do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ + __FILE__, __LINE__, #x); \ + BUG(); \ + } \ +} while (0) +#else +#define ASSERT(x) do { } while (0) +#endif + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +int kvm_create_lapic(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu); +void kvm_free_apic(struct kvm_lapic *apic); +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); +struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, + unsigned long bitmap); +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +int kvm_ioapic_init(struct kvm *kvm); +void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); +int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); + +#endif diff --git a/trunk/include/asm-x86/kvm_host.h b/trunk/drivers/kvm/kvm.h similarity index 64% rename from trunk/include/asm-x86/kvm_host.h rename to trunk/drivers/kvm/kvm.h index 4702b04b979a..3b0bc4bda5f2 100644 --- a/trunk/include/asm-x86/kvm_host.h +++ b/trunk/drivers/kvm/kvm.h @@ -1,24 +1,23 @@ -#/* - * Kernel-based Virtual Machine driver for Linux - * - * This header defines architecture specific interfaces, x86 version - * +#ifndef __KVM_H +#define __KVM_H + +/* * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. 
- * */ -#ifndef ASM_KVM_HOST_H -#define ASM_KVM_HOST_H - #include +#include +#include +#include +#include +#include #include +#include +#include #include #include -#include - -#include #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) @@ -38,8 +37,15 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) +#define KVM_MAX_VCPUS 4 +#define KVM_ALIAS_SLOTS 4 +#define KVM_MEMORY_SLOTS 8 +#define KVM_NUM_MMU_PAGES 1024 +#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_REFILL_PAGES 25 +#define KVM_MAX_CPUID_ENTRIES 40 + #define DE_VECTOR 0 -#define UD_VECTOR 6 #define NM_VECTOR 7 #define DF_VECTOR 8 #define TS_VECTOR 10 @@ -53,66 +59,31 @@ #define IOPL_SHIFT 12 -#define KVM_ALIAS_SLOTS 4 - -#define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_NUM_MMU_PAGES 1024 -#define KVM_MIN_FREE_MMU_PAGES 5 -#define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 - -extern spinlock_t kvm_lock; -extern struct list_head vm_list; - -struct kvm_vcpu; -struct kvm; - -enum { - VCPU_REGS_RAX = 0, - VCPU_REGS_RCX = 1, - VCPU_REGS_RDX = 2, - VCPU_REGS_RBX = 3, - VCPU_REGS_RSP = 4, - VCPU_REGS_RBP = 5, - VCPU_REGS_RSI = 6, - VCPU_REGS_RDI = 7, -#ifdef CONFIG_X86_64 - VCPU_REGS_R8 = 8, - VCPU_REGS_R9 = 9, - VCPU_REGS_R10 = 10, - VCPU_REGS_R11 = 11, - VCPU_REGS_R12 = 12, - VCPU_REGS_R13 = 13, - VCPU_REGS_R14 = 14, - VCPU_REGS_R15 = 15, -#endif - NR_VCPU_REGS -}; - -enum { - VCPU_SREG_CS, - VCPU_SREG_DS, - VCPU_SREG_ES, - VCPU_SREG_FS, - VCPU_SREG_GS, - VCPU_SREG_SS, - VCPU_SREG_TR, - VCPU_SREG_LDTR, -}; - -#include +#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_NR_MEM_OBJS 40 +/* + * vcpu->requests bit members + */ +#define KVM_TLB_FLUSH 0 /* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. + * Address types: + * + * gva - guest virtual address + * gpa - guest physical address + * gfn - guest frame number + * hva - host virtual address + * hpa - host physical address + * hfn - host frame number */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; + +typedef unsigned long gva_t; +typedef u64 gpa_t; +typedef unsigned long gfn_t; + +typedef unsigned long hva_t; +typedef u64 hpa_t; +typedef unsigned long hfn_t; #define NR_PTE_CHAIN_ENTRIES 5 @@ -128,7 +99,7 @@ struct kvm_pte_chain { * bits 4:7 - page table level for this shadow (1-4) * bits 8:9 - page table quadrant for 2-level guests * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) - * bits 17:19 - common access permissions for all ptes in this shadow page + * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde */ union kvm_mmu_page_role { unsigned word; @@ -138,7 +109,7 @@ union kvm_mmu_page_role { unsigned quadrant : 2; unsigned pad_for_nice_hex_output : 6; unsigned metaphysical : 1; - unsigned access : 3; + unsigned hugepage_access : 3; }; }; @@ -154,8 +125,6 @@ struct kvm_mmu_page { union kvm_mmu_page_role role; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; unsigned long slot_bitmap; /* One bit set per slot which has memory * in this shadow page. */ @@ -167,6 +136,9 @@ struct kvm_mmu_page { }; }; +struct kvm_vcpu; +extern struct kmem_cache *kvm_vcpu_cache; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). 
The kvm_mmu structure abstracts the details of the current mmu @@ -177,8 +149,6 @@ struct kvm_mmu { int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); - void (*prefetch_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *page); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -186,9 +156,159 @@ struct kvm_mmu { u64 *pae_root; }; -struct kvm_vcpu_arch { +#define KVM_NR_MEM_OBJS 20 + +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; + +/* + * We don't want allocation failures within the mmu code, so we preallocate + * enough memory for a single page fault in a cache. + */ +struct kvm_guest_debug { + int enabled; + unsigned long bp[4]; + int singlestep; +}; + +enum { + VCPU_REGS_RAX = 0, + VCPU_REGS_RCX = 1, + VCPU_REGS_RDX = 2, + VCPU_REGS_RBX = 3, + VCPU_REGS_RSP = 4, + VCPU_REGS_RBP = 5, + VCPU_REGS_RSI = 6, + VCPU_REGS_RDI = 7, +#ifdef CONFIG_X86_64 + VCPU_REGS_R8 = 8, + VCPU_REGS_R9 = 9, + VCPU_REGS_R10 = 10, + VCPU_REGS_R11 = 11, + VCPU_REGS_R12 = 12, + VCPU_REGS_R13 = 13, + VCPU_REGS_R14 = 14, + VCPU_REGS_R15 = 15, +#endif + NR_VCPU_REGS +}; + +enum { + VCPU_SREG_CS, + VCPU_SREG_DS, + VCPU_SREG_ES, + VCPU_SREG_FS, + VCPU_SREG_GS, + VCPU_SREG_SS, + VCPU_SREG_TR, + VCPU_SREG_LDTR, +}; + +struct kvm_pio_request { + unsigned long count; + int cur_count; + struct page *guest_pages[2]; + unsigned guest_page_offset; + int in; + int port; + int size; + int string; + int down; + int rep; +}; + +struct kvm_stat { + u32 pf_fixed; + u32 pf_guest; + u32 tlb_flush; + u32 invlpg; + + u32 exits; + u32 io_exits; + u32 mmio_exits; + u32 signal_exits; + u32 irq_window_exits; + u32 halt_exits; + u32 halt_wakeup; + u32 request_irq_exits; + u32 irq_exits; + u32 light_exits; + u32 efer_reload; +}; + +struct kvm_io_device { + void (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + void (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + int (*in_range)(struct kvm_io_device *this, gpa_t addr); + void (*destructor)(struct kvm_io_device *this); + + void *private; +}; + +static inline void kvm_iodevice_read(struct kvm_io_device *dev, + gpa_t addr, + int len, + void *val) +{ + dev->read(dev, addr, len, val); +} + +static inline void kvm_iodevice_write(struct kvm_io_device *dev, + gpa_t addr, + int len, + const void *val) +{ + dev->write(dev, addr, len, val); +} + +static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) +{ + return dev->in_range(dev, addr); +} + +static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->destructor) + dev->destructor(dev); +} + +/* + * It would be nice to use something smarter than a linear search, TBD... + * Thankfully we dont expect many devices to register (famous last words :), + * so until then it will suffice. At least its abstracted so we can change + * in one place. 
+ */ +struct kvm_io_bus { + int dev_count; +#define NR_IOBUS_DEVS 6 + struct kvm_io_device *devs[NR_IOBUS_DEVS]; +}; + +void kvm_io_bus_init(struct kvm_io_bus *bus); +void kvm_io_bus_destroy(struct kvm_io_bus *bus); +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, + struct kvm_io_device *dev); + +struct kvm_vcpu { + struct kvm *kvm; + struct preempt_notifier preempt_notifier; + int vcpu_id; + struct mutex mutex; + int cpu; u64 host_tsc; + struct kvm_run *run; int interrupt_window_open; + int guest_mode; + unsigned long requests; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ @@ -197,6 +317,9 @@ struct kvm_vcpu_arch { unsigned long cr0; unsigned long cr2; unsigned long cr3; + gpa_t para_state_gpa; + struct page *para_state_page; + gpa_t hypercall_gpa; unsigned long cr4; unsigned long cr8; u64 pdptrs[4]; /* pae */ @@ -211,7 +334,6 @@ struct kvm_vcpu_arch { int mp_state; int sipi_vector; u64 ia32_misc_enable_msr; - bool tpr_access_reporting; struct kvm_mmu mmu; @@ -222,26 +344,29 @@ struct kvm_vcpu_arch { gfn_t last_pt_write_gfn; int last_pt_write_count; - u64 *last_pte_updated; - struct { - gfn_t gfn; /* presumed gfn during guest pte update */ - struct page *page; /* page corresponding to that gfn */ - } update_pte; + struct kvm_guest_debug guest_debug; struct i387_fxsave_struct host_fx_image; struct i387_fxsave_struct guest_fx_image; - + int fpu_active; + int guest_fpu_loaded; + + int mmio_needed; + int mmio_read_completed; + int mmio_is_write; + int mmio_size; + unsigned char mmio_data[8]; + gpa_t mmio_phys_addr; gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; + wait_queue_head_t wq; - struct kvm_queued_exception { - bool pending; - bool has_error_code; - u8 nr; - u32 error_code; - } exception; + int sigset_active; + sigset_t sigset; + + struct kvm_stat stat; struct { int active; @@ -256,10 +381,7 @@ struct kvm_vcpu_arch { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; - /* emulate context */ - - struct x86_emulate_ctxt emulate_ctxt; + struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; }; struct kvm_mem_alias { @@ -268,58 +390,51 @@ struct kvm_mem_alias { gfn_t target_gfn; }; -struct kvm_arch{ +struct kvm_memory_slot { + gfn_t base_gfn; + unsigned long npages; + unsigned long flags; + struct page **phys_mem; + unsigned long *dirty_bitmap; +}; + +struct kvm { + struct mutex lock; /* protects everything except vcpus */ int naliases; struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - - unsigned int n_free_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; - struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + int nmemslots; + struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; /* * Hash table of struct kvm_mmu_page. 
*/ struct list_head active_mmu_pages; + int n_free_mmu_pages; + struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + unsigned long rmap_overflow; + struct list_head vm_list; + struct file *filp; + struct kvm_io_bus mmio_bus; + struct kvm_io_bus pio_bus; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; - int round_robin_prev_vcpu; - unsigned int tss_addr; - struct page *apic_access_page; }; -struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 mmu_cache_miss; - u32 remote_tlb_flush; -}; +static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) +{ + return kvm->vpic; +} -struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; +static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) +{ + return kvm->vioapic; +} - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 halt_exits; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; -}; +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return pic_irqchip(kvm) != 0; +} struct descriptor_table { u16 limit; @@ -334,12 +449,11 @@ struct kvm_x86_ops { void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ - bool (*cpu_has_accelerated_tpr)(void); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - int (*vcpu_reset)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); @@ -375,6 +489,10 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + unsigned long addr, u32 err_code); + + void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -383,31 +501,54 @@ struct kvm_x86_ops { unsigned char *hypercall_addr); int (*get_irq)(struct kvm_vcpu *vcpu); void (*set_irq)(struct kvm_vcpu *vcpu, int vec); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code); - bool (*exception_injected)(struct kvm_vcpu *vcpu); void (*inject_pending_irq)(struct kvm_vcpu *vcpu); void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, struct kvm_run *run); - - int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); }; extern struct kvm_x86_ops *kvm_x86_ops; +/* The guest did something we don't support. */ +#define pr_unimpl(vcpu, fmt, ...) \ + do { \ + if (printk_ratelimit()) \ + printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ + current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ + } while(0) + +#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) +#define vcpu_printf(vcpu, fmt...) 
kvm_printf(vcpu->kvm, fmt) + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); + +int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, + struct module *module); +void kvm_exit_x86(void); + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); -void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); +#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) +#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) +static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); + +extern hpa_t bad_page_address; + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +void mark_page_dirty(struct kvm *kvm, gfn_t gfn); enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -415,10 +556,8 @@ enum emulation_result { EMULATE_FAIL, /* can't emulate this instruction */ }; -#define EMULTYPE_NO_DECODE (1 << 0) -#define EMULTYPE_TRAP_UD (1 << 1) int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long cr2, u16 error_code, int emulation_type); + unsigned long cr2, u16 error_code); void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); @@ -433,7 +572,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port); int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned long count, int down, @@ -442,7 +581,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest); int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value); @@ -458,15 +597,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); - void fx_init(struct kvm_vcpu *vcpu); +void kvm_resched(struct kvm_vcpu *vcpu); +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); +void kvm_flush_remote_tlbs(struct kvm *kvm); + int emulator_read_std(unsigned long addr, 
- void *val, + void *val, unsigned int bytes, struct kvm_vcpu *vcpu); int emulator_write_emulated(unsigned long addr, @@ -476,7 +615,6 @@ int emulator_write_emulated(unsigned long addr, unsigned long segment_base(u16 selector); -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); @@ -484,14 +622,66 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); +static inline void kvm_guest_enter(void) +{ + current->flags |= PF_VCPU; +} -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); +static inline void kvm_guest_exit(void) +{ + current->flags &= ~PF_VCPU; +} -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); -int complete_pio(struct kvm_vcpu *vcpu); +static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + return vcpu->mmu.page_fault(vcpu, gva, error_code); +} + +static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + __kvm_mmu_free_some_pages(vcpu); +} + +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->shadow_efer & EFER_LME; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & X86_CR4_PAE; +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return vcpu->cr4 & X86_CR4_PSE; +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & X86_CR0_PG; +} + +static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + return slot - kvm->memslots; +} static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { @@ -503,55 +693,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) static inline u16 read_fs(void) { u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); + asm ("mov %%fs, %0" : "=g"(seg)); return seg; } static inline u16 read_gs(void) { u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); + asm ("mov %%gs, %0" : "=g"(seg)); return seg; } static inline u16 read_ldt(void) { u16 ldt; - asm("sldt %0" : "=g"(ldt)); + asm ("sldt %0" : "=g"(ldt)); return ldt; } static inline void load_fs(u16 sel) { - asm("mov %0, %%fs" : : "rm"(sel)); + asm ("mov %0, %%fs" : : "rm"(sel)); } static inline void load_gs(u16 sel) { - asm("mov %0, %%gs" : : "rm"(sel)); + asm ("mov %0, %%gs" : : "rm"(sel)); } #ifndef load_ldt static inline void load_ldt(u16 sel) { - asm("lldt %0" : : "rm"(sel)); + asm ("lldt %0" : : "rm"(sel)); } #endif static inline void get_idt(struct descriptor_table *table) { - asm("sidt %0" : "=m"(*table)); + asm ("sidt %0" : "=m"(*table)); } static inline void get_gdt(struct descriptor_table *table) { - asm("sgdt %0" : "=m"(*table)); + asm ("sgdt %0" : "=m"(*table)); } static inline unsigned long read_tr_base(void) { u16 tr; - asm("str %0" : "=g"(tr)); + asm ("str %0" : "=g"(tr)); return segment_base(tr); } @@ -567,17 +757,17 @@ static inline unsigned long read_msr(unsigned long msr) static inline void fx_save(struct i387_fxsave_struct *image) { - 
asm("fxsave (%0)":: "r" (image)); + asm ("fxsave (%0)":: "r" (image)); } static inline void fx_restore(struct i387_fxsave_struct *image) { - asm("fxrstor (%0)":: "r" (image)); + asm ("fxrstor (%0)":: "r" (image)); } static inline void fpu_init(void) { - asm("finit"); + asm ("finit"); } static inline u32 get_rdx_init_val(void) @@ -585,11 +775,6 @@ static inline u32 get_rdx_init_val(void) return 0x600; /* P6 family */ } -static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) -{ - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); -} - #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" diff --git a/trunk/arch/x86/kvm/x86.c b/trunk/drivers/kvm/kvm_main.c similarity index 52% rename from trunk/arch/x86/kvm/x86.c rename to trunk/drivers/kvm/kvm_main.c index 8f94a0b89dff..c0f372f1d761 100644 --- a/trunk/arch/x86/kvm/x86.c +++ b/trunk/drivers/kvm/kvm_main.c @@ -1,7 +1,8 @@ /* * Kernel-based Virtual Machine driver for Linux * - * derived from drivers/kvm/kvm_main.c + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. * @@ -14,22 +15,80 @@ * */ -#include +#include "kvm.h" +#include "x86_emulate.h" #include "segment_descriptor.h" #include "irq.h" -#include "mmu.h" #include -#include -#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include - -#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include +#include +#include +#include + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static DEFINE_SPINLOCK(kvm_lock); +static LIST_HEAD(vm_list); + +static cpumask_t cpus_hardware_enabled; + +struct kvm_x86_ops *kvm_x86_ops; +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); + +static __read_mostly struct preempt_ops kvm_preempt_ops; + +#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) + +static struct kvm_stats_debugfs_item { + const char *name; + int offset; + struct dentry *dentry; +} debugfs_entries[] = { + { "pf_fixed", STAT_OFFSET(pf_fixed) }, + { "pf_guest", STAT_OFFSET(pf_guest) }, + { "tlb_flush", STAT_OFFSET(tlb_flush) }, + { "invlpg", STAT_OFFSET(invlpg) }, + { "exits", STAT_OFFSET(exits) }, + { "io_exits", STAT_OFFSET(io_exits) }, + { "mmio_exits", STAT_OFFSET(mmio_exits) }, + { "signal_exits", STAT_OFFSET(signal_exits) }, + { "irq_window", STAT_OFFSET(irq_window_exits) }, + { "halt_exits", STAT_OFFSET(halt_exits) }, + { "halt_wakeup", STAT_OFFSET(halt_wakeup) }, + { "request_irq", STAT_OFFSET(request_irq_exits) }, + { "irq_exits", STAT_OFFSET(irq_exits) }, + { "light_exits", STAT_OFFSET(light_exits) }, + { "efer_reload", STAT_OFFSET(efer_reload) }, + { NULL } +}; + +static struct dentry *debugfs_dir; #define MAX_IO_MSRS 256 + #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ @@ -43,151 +102,317 @@ #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define EFER_RESERVED_BITS 0xfffffffffffff2fe -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU - -struct kvm_x86_ops *kvm_x86_ops; - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", 
VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { NULL } +#ifdef CONFIG_X86_64 +// LDT or TSS descriptor in the GDT. 16 bytes. +struct segment_descriptor_64 { + struct segment_descriptor s; + u32 base_higher; + u32 pad_zero; }; +#endif + +static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg); unsigned long segment_base(u16 selector) { struct descriptor_table gdt; struct segment_descriptor *d; unsigned long table_base; + typedef unsigned long ul; unsigned long v; if (selector == 0) return 0; - asm("sgdt %0" : "=m"(gdt)); + asm ("sgdt %0" : "=m"(gdt)); table_base = gdt.base; if (selector & 4) { /* from ldt */ u16 ldt_selector; - asm("sldt %0" : "=g"(ldt_selector)); + asm ("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } d = (struct segment_descriptor *)(table_base + (selector & ~7)); - v = d->base_low | ((unsigned long)d->base_mid << 16) | - ((unsigned long)d->base_high << 24); + v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); #ifdef CONFIG_X86_64 - if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long) \ - ((struct segment_descriptor_64 *)d)->base_higher) << 32; + if (d->system == 0 + && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; #endif return v; } EXPORT_SYMBOL_GPL(segment_base); -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) +static inline int valid_vcpu(int n) { - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->arch.apic_base; - else - return vcpu->arch.apic_base; + return likely(n >= 0 && n < KVM_MAX_VCPUS); } -EXPORT_SYMBOL_GPL(kvm_get_apic_base); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); + if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 1; + fx_save(&vcpu->host_fx_image); + fx_restore(&vcpu->guest_fx_image); +} +EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); + +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 0; + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); +} +EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); + +/* + * Switches to specified vcpu, until a matching vcpu_put() + */ +static void vcpu_load(struct kvm_vcpu *vcpu) +{ + int cpu; + + mutex_lock(&vcpu->mutex); + cpu = 
get_cpu(); + preempt_notifier_register(&vcpu->preempt_notifier); + kvm_x86_ops->vcpu_load(vcpu, cpu); + put_cpu(); +} + +static void vcpu_put(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + kvm_x86_ops->vcpu_put(vcpu); + preempt_notifier_unregister(&vcpu->preempt_notifier); + preempt_enable(); + mutex_unlock(&vcpu->mutex); +} + +static void ack_flush(void *_completed) +{ +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + + cpus_clear(cpus); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (!vcpu) + continue; + if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) + continue; + cpu = vcpu->cpu; + if (cpu != -1 && cpu != raw_smp_processor_id()) + cpu_set(cpu, cpus); + } + smp_call_function_mask(cpus, ack_flush, NULL, 1); +} + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu = -1; + vcpu->mmu.root_hpa = INVALID_PAGE; + vcpu->kvm = kvm; + vcpu->vcpu_id = id; + if (!irqchip_in_kernel(kvm) || id == 0) + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; else - vcpu->arch.apic_base = data; + vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; + init_waitqueue_head(&vcpu->wq); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->run = page_address(page); + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail_free_run; + } + vcpu->pio_data = page_address(page); + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; + + return 0; + +fail_free_pio_data: + free_page((unsigned long)vcpu->pio_data); +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return -ENOMEM; } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); +EXPORT_SYMBOL_GPL(kvm_vcpu_init); -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_mmu_destroy(vcpu); + if (vcpu->apic) + hrtimer_cancel(&vcpu->apic->timer.dev); + kvm_free_apic(vcpu->apic); + free_page((unsigned long)vcpu->pio_data); + free_page((unsigned long)vcpu->run); } -EXPORT_SYMBOL_GPL(kvm_queue_exception); +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) +static struct kvm *kvm_create_vm(void) { - ++vcpu->stat.pf_guest; - if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG "kvm: inject_page_fault:" - " double fault 0x%lx\n", addr); - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; + struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + + if (!kvm) + return ERR_PTR(-ENOMEM); + + kvm_io_bus_init(&kvm->pio_bus); + mutex_init(&kvm->lock); + INIT_LIST_HEAD(&kvm->active_mmu_pages); + kvm_io_bus_init(&kvm->mmio_bus); + spin_lock(&kvm_lock); + list_add(&kvm->vm_list, &vm_list); + spin_unlock(&kvm_lock); + return kvm; +} + +/* + * Free any memory in @free but not in @dont. 
+ */ +static void kvm_free_physmem_slot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + if (!dont || free->phys_mem != dont->phys_mem) + if (free->phys_mem) { + for (i = 0; i < free->npages; ++i) + if (free->phys_mem[i]) + __free_page(free->phys_mem[i]); + vfree(free->phys_mem); + } + + if (!dont || free->dirty_bitmap != dont->dirty_bitmap) + vfree(free->dirty_bitmap); + + free->phys_mem = NULL; + free->npages = 0; + free->dirty_bitmap = NULL; +} + +static void kvm_free_physmem(struct kvm *kvm) +{ + int i; + + for (i = 0; i < kvm->nmemslots; ++i) + kvm_free_physmem_slot(&kvm->memslots[i], NULL); +} + +static void free_pio_guest_pages(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) + if (vcpu->pio.guest_pages[i]) { + __free_page(vcpu->pio.guest_pages[i]); + vcpu->pio.guest_pages[i] = NULL; + } +} + +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); +} + +static void kvm_free_vcpus(struct kvm *kvm) +{ + unsigned int i; + + /* + * Unpin any mmu pages first. + */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i]) + kvm_unload_vcpu_mmu(kvm->vcpus[i]); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_x86_ops->vcpu_free(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } } - vcpu->arch.cr2 = addr; - kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); + +} + +static void kvm_destroy_vm(struct kvm *kvm) +{ + spin_lock(&kvm_lock); + list_del(&kvm->vm_list); + spin_unlock(&kvm_lock); + kvm_io_bus_destroy(&kvm->pio_bus); + kvm_io_bus_destroy(&kvm->mmio_bus); + kfree(kvm->vpic); + kfree(kvm->vioapic); + kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); + kfree(kvm); } -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +static int kvm_vm_release(struct inode *inode, struct file *filp) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + struct kvm *kvm = filp->private_data; + + kvm_destroy_vm(kvm); + return 0; } -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -static void __queue_exception(struct kvm_vcpu *vcpu) +static void inject_gp(struct kvm_vcpu *vcpu) { - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + kvm_x86_ops->inject_gp(vcpu, 0); } /* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; + u64 *pdpt; int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + struct page *page; + u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; - down_read(&current->mm->mmap_sem); - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); - if (ret < 0) { + mutex_lock(&vcpu->kvm->lock); + page = gfn_to_page(vcpu->kvm, pdpt_gfn); + if (!page) { ret = 0; goto out; } + + pdpt = kmap_atomic(page, KM_USER0); + memcpy(pdpte, pdpt+offset, sizeof(pdpte)); + kunmap_atomic(pdpt, KM_USER0); + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { ret = 0; @@ -196,96 +421,78 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); out: - up_read(&current->mm->mmap_sem); + mutex_unlock(&vcpu->kvm->lock); return ret; } -static bool pdptrs_changed(struct kvm_vcpu *vcpu) -{ - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - bool changed = true; - int r; - - if (is_long_mode(vcpu) || !is_pae(vcpu)) - return false; - - down_read(&current->mm->mmap_sem); - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); - if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; -out: - up_read(&current->mm->mmap_sem); - - return changed; -} - void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); - kvm_inject_gp(vcpu, 0); + cr0, vcpu->cr0); + inject_gp(vcpu); return; } if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { printk(KERN_DEBUG "set_cr0: #GP, set PG flag " "and a clear PE flag\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.shadow_efer & EFER_LME)) { + if ((vcpu->shadow_efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while PAE is disabled\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) { printk(KERN_DEBUG "set_cr0: #GP, start paging " "in long mode while CS.L == 1\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr0: #GP, pdptrs " "reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->arch.cr0 = cr0; + vcpu->cr0 = cr0; + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); return; } EXPORT_SYMBOL_GPL(set_cr0); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(lmsw); @@ -293,7 +500,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); + 
inject_gp(vcpu); return; } @@ -301,38 +508,35 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!(cr4 & X86_CR4_PAE)) { printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " "in long mode\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + && !load_pdptrs(vcpu, vcpu->cr3)) { printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (cr4 & X86_CR4_VMXE) { printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; + vcpu->cr4 = cr4; + mutex_lock(&vcpu->kvm->lock); kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { - kvm_mmu_flush_tlb(vcpu); - return; - } - if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } } else { @@ -340,23 +544,26 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 & CR3_PAE_RESERVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { printk(KERN_DEBUG "set_cr3: #GP, pdptrs " "reserved bits\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); + return; + } + } else { + if (cr3 & CR3_NONPAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + inject_gp(vcpu); return; } } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ } - down_read(&current->mm->mmap_sem); + mutex_lock(&vcpu->kvm->lock); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -367,12 +574,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); + vcpu->cr3 = cr3; + vcpu->mmu.new_cr3(vcpu); } - up_read(&current->mm->mmap_sem); + mutex_unlock(&vcpu->kvm->lock); } EXPORT_SYMBOL_GPL(set_cr3); @@ -380,13 +587,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return; } if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else - vcpu->arch.cr8 = cr8; + vcpu->cr8 = cr8; } EXPORT_SYMBOL_GPL(set_cr8); @@ -395,1589 +602,1157 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) return kvm_lapic_get_cr8(vcpu); else - return vcpu->arch.cr8; + return vcpu->cr8; } EXPORT_SYMBOL_GPL(get_cr8); -/* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * This list is modified at module load time to reflect the - * capabilities of the host cpu. 
- */ -static u32 msrs_to_save[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TIME_STAMP_COUNTER, -}; - -static unsigned num_msrs_to_save; - -static u32 emulated_msrs[] = { - MSR_IA32_MISC_ENABLE, -}; - -#ifdef CONFIG_X86_64 - -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { - if (efer & EFER_RESERVED_BITS) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); - kvm_inject_gp(vcpu, 0); - return; - } + if (irqchip_in_kernel(vcpu->kvm)) + return vcpu->apic_base; + else + return vcpu->apic_base; +} +EXPORT_SYMBOL_GPL(kvm_get_apic_base); - if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); - kvm_inject_gp(vcpu, 0); - return; - } +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +{ + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->apic_base = data; +} +EXPORT_SYMBOL_GPL(kvm_set_apic_base); - kvm_x86_ops->set_efer(vcpu, efer); +void fx_init(struct kvm_vcpu *vcpu) +{ + unsigned after_mxcsr_mask; - efer &= ~EFER_LMA; - efer |= vcpu->arch.shadow_efer & EFER_LMA; + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + fx_save(&vcpu->host_fx_image); + fpu_init(); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); + preempt_enable(); - vcpu->arch.shadow_efer = efer; + vcpu->cr0 |= X86_CR0_ET; + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->guest_fx_image.mxcsr = 0x1f80; + memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } - -#endif +EXPORT_SYMBOL_GPL(fx_init); /* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. + * Allocate some memory and give it an address in the guest physical address + * space. + * + * Discontiguous memory is allowed, mostly for framebuffers. 
*/ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, + struct kvm_memory_region *mem) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); -} + int r; + gfn_t base_gfn; + unsigned long npages; + unsigned long i; + struct kvm_memory_slot *memslot; + struct kvm_memory_slot old, new; -/* - * Adapt set_msr() to msr_io()'s calling convention - */ -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - return kvm_set_msr(vcpu, index, *data); -} + r = -EINVAL; + /* General sanity checks */ + if (mem->memory_size & (PAGE_SIZE - 1)) + goto out; + if (mem->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (mem->slot >= KVM_MEMORY_SLOTS) + goto out; + if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) + goto out; + memslot = &kvm->memslots[mem->slot]; + base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; + npages = mem->memory_size >> PAGE_SHIFT; -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { -#ifdef CONFIG_X86_64 - case MSR_EFER: - set_efer(vcpu, data); - break; -#endif - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __FUNCTION__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case 0x200 ... 0x2ff: /* MTRRs */ - break; - case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; - case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; - break; - default: - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); - return 1; + if (!npages) + mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + + mutex_lock(&kvm->lock); + + new = old = *memslot; + + new.base_gfn = base_gfn; + new.npages = npages; + new.flags = mem->flags; + + /* Disallow changing a memory slot's size. */ + r = -EINVAL; + if (npages && old.npages && npages != old.npages) + goto out_unlock; + + /* Check for overlaps */ + r = -EEXIST; + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *s = &kvm->memslots[i]; + + if (s == memslot) + continue; + if (!((base_gfn + npages <= s->base_gfn) || + (base_gfn >= s->base_gfn + s->npages))) + goto out_unlock; } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_msr_common); + /* Deallocate if slot is being removed */ + if (!npages) + new.phys_mem = NULL; -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); -} + /* Free page dirty bitmap if unneeded */ + if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) + new.dirty_bitmap = NULL; -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; + r = -ENOMEM; - switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_UCODE_REV: - case MSR_IA32_PERF_STATUS: - case MSR_IA32_EBL_CR_POWERON: - /* MTRR registers */ - case 0xfe: - case 0x200 ... 
0x2ff: - data = 0; - break; - case 0xcd: /* fsb frequency */ - data = 3; - break; - case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); - break; - case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; - break; -#ifdef CONFIG_X86_64 - case MSR_EFER: - data = vcpu->arch.shadow_efer; - break; -#endif - default: - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; + /* Allocate if a slot is being created */ + if (npages && !new.phys_mem) { + new.phys_mem = vmalloc(npages * sizeof(struct page *)); + + if (!new.phys_mem) + goto out_unlock; + + memset(new.phys_mem, 0, npages * sizeof(struct page *)); + for (i = 0; i < npages; ++i) { + new.phys_mem[i] = alloc_page(GFP_HIGHUSER + | __GFP_ZERO); + if (!new.phys_mem[i]) + goto out_unlock; + set_page_private(new.phys_mem[i],0); + } } - *pdata = data; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_get_msr_common); -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. - */ -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, - struct kvm_msr_entry *entries, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data)) -{ - int i; + /* Allocate page dirty bitmap if needed */ + if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { + unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; - vcpu_load(vcpu); + new.dirty_bitmap = vmalloc(dirty_bytes); + if (!new.dirty_bitmap) + goto out_unlock; + memset(new.dirty_bitmap, 0, dirty_bytes); + } - for (i = 0; i < msrs->nmsrs; ++i) - if (do_msr(vcpu, entries[i].index, &entries[i].data)) - break; + if (mem->slot >= kvm->nmemslots) + kvm->nmemslots = mem->slot + 1; - vcpu_put(vcpu); + *memslot = new; - return i; + kvm_mmu_slot_remove_write_access(kvm, mem->slot); + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->lock); + + kvm_free_physmem_slot(&old, &new); + return 0; + +out_unlock: + mutex_unlock(&kvm->lock); + kvm_free_physmem_slot(&new, &old); +out: + return r; } /* - * Read or write a bunch of msrs. Parameters are user addresses. - * - * @return number of msrs set successfully. + * Get (and clear) the dirty memory log for a memory slot. 
*/ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data), - int writeback) +static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) { - struct kvm_msrs msrs; - struct kvm_msr_entry *entries; - int r, n; - unsigned size; + struct kvm_memory_slot *memslot; + int r, i; + int n; + unsigned long any = 0; - r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof msrs)) - goto out; + mutex_lock(&kvm->lock); - r = -E2BIG; - if (msrs.nmsrs >= MAX_IO_MSRS) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) goto out; - r = -ENOMEM; - size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = vmalloc(size); - if (!entries) + memslot = &kvm->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) goto out; - r = -EFAULT; - if (copy_from_user(entries, user_msrs->entries, size)) - goto out_free; + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; + for (i = 0; !any && i < n/sizeof(long); ++i) + any = memslot->dirty_bitmap[i]; r = -EFAULT; - if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; - r = n; + /* If nothing is dirty, don't bother messing with page tables. */ + if (any) { + kvm_mmu_slot_remove_write_access(kvm, log->slot); + kvm_flush_remote_tlbs(kvm); + memset(memslot->dirty_bitmap, 0, n); + } + + r = 0; -out_free: - vfree(entries); out: + mutex_unlock(&kvm->lock); return r; } /* - * Make sure that a cpu that is being hot-unplugged does not have any vcpus - * cached on it. + * Set a new alias region. Aliases map a portion of physical memory into + * another portion. This is useful for memory windows, for example the PC + * VGA region. */ -void decache_vcpus_on_cpu(int cpu) +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) { - struct kvm *vm; - struct kvm_vcpu *vcpu; - int i; + int r, n; + struct kvm_mem_alias *p; - spin_lock(&kvm_lock); - list_for_each_entry(vm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = vm->vcpus[i]; - if (!vcpu) - continue; - /* - * If the vcpu is locked, then it is running on some - * other cpu and therefore it is not cached on the - * cpu in question. - * - * If it's not locked, check the last cpu it executed - * on. 
- */ - if (mutex_trylock(&vcpu->mutex)) { - if (vcpu->cpu == cpu) { - kvm_x86_ops->vcpu_decache(vcpu); - vcpu->cpu = -1; - } - mutex_unlock(&vcpu->mutex); - } - } - spin_unlock(&kvm_lock); + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + mutex_lock(&kvm->lock); + + p = &kvm->aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (kvm->aliases[n - 1].npages) + break; + kvm->naliases = n; + + kvm_mmu_zap_all(kvm); + + mutex_unlock(&kvm->lock); + + return 0; + +out: + return r; } -int kvm_dev_ioctl_check_extension(long ext) +static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_HLT: - case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_USER_MEMORY: - case KVM_CAP_SET_TSS_ADDR: - case KVM_CAP_EXT_CPUID: - r = 1; - break; - case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy (&chip->chip.pic, + &pic_irqchip(kvm)->pics[0], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy (&chip->chip.pic, + &pic_irqchip(kvm)->pics[1], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy (&chip->chip.ioapic, + ioapic_irqchip(kvm), + sizeof(struct kvm_ioapic_state)); break; default: - r = 0; + r = -EINVAL; break; } return r; - } -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { - void __user *argp = (void __user *)arg; - long r; - - switch (ioctl) { - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; - unsigned n; + int r; - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) - goto out; - r = -E2BIG; - if (n < num_msrs_to_save) - goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) - goto out; - if (copy_to_user(user_msr_list->indices - + num_msrs_to_save * sizeof(u32), - &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) - goto out; - r = 0; + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy (&pic_irqchip(kvm)->pics[0], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy (&pic_irqchip(kvm)->pics[1], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy (ioapic_irqchip(kvm), + &chip->chip.ioapic, + sizeof(struct kvm_ioapic_state)); break; - } default: r = -EINVAL; + break; } -out: + kvm_pic_update_irq(pic_irqchip(kvm)); return r; } -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - kvm_x86_ops->vcpu_load(vcpu, cpu); -} - -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->vcpu_put(vcpu); - 
kvm_put_guest_fpu(vcpu); -} - -static int is_efer_nx(void) +static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { - u64 efer; + int i; + struct kvm_mem_alias *alias; - rdmsrl(MSR_EFER, efer); - return efer & EFER_NX; + for (i = 0; i < kvm->naliases; ++i) { + alias = &kvm->aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; } -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { int i; - struct kvm_cpuid_entry2 *e, *entry; - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); - printk(KERN_INFO "kvm: guest NX capability removed\n"); + for (i = 0; i < kvm->nmemslots; ++i) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + + if (gfn >= memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; } + return NULL; } -/* when an old userspace process fills a new kernel module */ -static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { - int r, i; - struct kvm_cpuid_entry *cpuid_entries; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out_free; - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - r = 0; - -out_free: - vfree(cpuid_entries); -out: - return r; + gfn = unalias_gfn(kvm, gfn); + return __gfn_to_memslot(kvm, gfn); } -static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { - int r; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - return 0; + struct kvm_memory_slot *slot; -out: - return r; + gfn = unalias_gfn(kvm, gfn); + slot = __gfn_to_memslot(kvm, gfn); + if (!slot) + return NULL; + return slot->phys_mem[gfn - slot->base_gfn]; } +EXPORT_SYMBOL_GPL(gfn_to_page); -static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +/* WARNING: Does not work on aliased pages. 
*/ +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { - int r; - - r = -E2BIG; - if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; + struct kvm_memory_slot *memslot; -out: - cpuid->nent = vcpu->arch.cpuid_nent; - return r; -} + memslot = __gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn = gfn - memslot->base_gfn; -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) -{ - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; -} - -static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) -{ - const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | - bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | - bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); - const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | - bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | - bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | - bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | - bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | - bit(X86_FEATURE_PGE) | - bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | - bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | - bit(X86_FEATURE_SYSCALL) | - (bit(X86_FEATURE_NX) && is_efer_nx()) | -#ifdef CONFIG_X86_64 - bit(X86_FEATURE_LM) | -#endif - bit(X86_FEATURE_MMXEXT) | - bit(X86_FEATURE_3DNOWEXT) | - bit(X86_FEATURE_3DNOW); - const u32 kvm_supported_word3_x86_features = - bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); - const u32 kvm_supported_word6_x86_features = - bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); - - /* all func 2 cpuid_count() should be called on the same cpu */ - get_cpu(); - do_cpuid_1_ent(entry, function, index); - ++*nent; - - switch (function) { - case 0: - entry->eax = min(entry->eax, (u32)0xb); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - entry->ecx &= kvm_supported_word3_x86_features; - break; - /* function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - for (t = 1; t < times && *nent < maxnent; ++t) { - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 and 0xb have additional index. 
*/ - case 4: { - int index, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - cache_type = entry[index - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xb: { - int index, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - level_type = entry[index - 1].ecx & 0xff; - if (!level_type) - break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - entry->ecx &= kvm_supported_word6_x86_features; - break; + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); } - put_cpu(); } -static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG; - u32 func; + void *data = val; - if (cpuid->nent < 1) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); - if (!cpuid_entries) - goto out; + while (bytes) { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned long pfn; + struct page *page; + void *page_virt; - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + pfn = gpa >> PAGE_SHIFT; + page = gfn_to_page(vcpu->kvm, pfn); + if (!page) + return X86EMUL_UNHANDLEABLE; + page_virt = kmap_atomic(page, KM_USER0); - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; + memcpy(data, page_virt + offset, tocopy); -out_free: - vfree(cpuid_entries); -out: - return r; -} + kunmap_atomic(page_virt, KM_USER0); -static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) -{ - vcpu_load(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); + bytes -= tocopy; + data += tocopy; + addr += tocopy; + } - return 0; + return X86EMUL_CONTINUE; } +EXPORT_SYMBOL_GPL(emulator_read_std); -static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +static int emulator_write_std(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); - vcpu_put(vcpu); - - 
return 0; + pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); + return X86EMUL_UNHANDLEABLE; } -static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) +/* + * Only apic need an MMIO device hook, so shortcut now.. + */ +static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, + gpa_t addr) { - if (irq->irq < 0 || irq->irq >= 256) - return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) - return -ENXIO; - vcpu_load(vcpu); + struct kvm_io_device *dev; - set_bit(irq->irq, vcpu->arch.irq_pending); - set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + if (vcpu->apic) { + dev = &vcpu->apic->dev; + if (dev->in_range(dev, addr)) + return dev; + } + return NULL; +} - vcpu_put(vcpu); +static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + struct kvm_io_device *dev; - return 0; + dev = vcpu_find_pervcpu_dev(vcpu, addr); + if (dev == NULL) + dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); + return dev; } -static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, - struct kvm_tpr_access_ctl *tac) +static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) { - if (tac->flags) - return -EINVAL; - vcpu->arch.tpr_access_reporting = !!tac->enabled; - return 0; + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); } -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int emulator_read_emulated(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - int r; + struct kvm_io_device *mmio_dev; + gpa_t gpa; - switch (ioctl) { - case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } else if (emulator_read_std(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) - goto out; - r = 0; - break; - } - case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; + gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; - r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) - goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; - if (r) - goto out; - r = 0; - break; + /* + * Is this MMIO handled locally? 
+ */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_read(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; } - case KVM_INTERRUPT: { - struct kvm_interrupt irq; - r = -EFAULT; - if (copy_from_user(&irq, argp, sizeof irq)) - goto out; - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; - break; - } - case KVM_SET_CPUID: { - struct kvm_cpuid __user *cpuid_arg = argp; - struct kvm_cpuid cpuid; + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_SET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + return X86EMUL_UNHANDLEABLE; +} - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_GET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; +static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + struct page *page; + void *virt; - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); - break; - case KVM_SET_MSRS: - r = msr_io(vcpu, argp, do_set_msr, 0); - break; - case KVM_TPR_ACCESS_REPORTING: { - struct kvm_tpr_access_ctl tac; + if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) + return 0; + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) + return 0; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); + virt = kmap_atomic(page, KM_USER0); + kvm_mmu_pte_write(vcpu, gpa, val, bytes); + memcpy(virt + offset_in_page(gpa), val, bytes); + kunmap_atomic(virt, KM_USER0); + return 1; +} - r = -EFAULT; - if (copy_from_user(&tac, argp, sizeof tac)) - goto out; - r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &tac, sizeof tac)) - goto out; - r = 0; - break; - }; - case KVM_SET_VAPIC_ADDR: { - struct kvm_vapic_addr va; +static int emulator_write_emulated_onepage(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + struct kvm_io_device *mmio_dev; + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); - r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) - goto out; - r = -EFAULT; - if (copy_from_user(&va, argp, sizeof va)) - goto out; - r = 0; - kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); - break; + if (gpa == UNMAPPED_GVA) { + kvm_x86_ops->inject_page_fault(vcpu, addr, 2); + return X86EMUL_PROPAGATE_FAULT; } - default: - r = -EINVAL; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + + /* + * Is this MMIO handled locally? 
+ */ + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_write(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; } -out: - return r; + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return X86EMUL_CONTINUE; } -static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - int ret; + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; - if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); - return ret; + now = -addr & ~PAGE_MASK; + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } +EXPORT_SYMBOL_GPL(emulator_write_emulated); -static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, - u32 kvm_nr_mmu_pages) +static int emulator_cmpxchg_emulated(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu) { - if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) - return -EINVAL; + static int reported; - down_write(&current->mm->mmap_sem); + if (!reported) { + reported = 1; + printk(KERN_WARNING "kvm: emulating exchange as write\n"); + } + return emulator_write_emulated(addr, new, bytes, vcpu); +} - kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); - kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_ops->get_segment_base(vcpu, seg); +} - up_write(&current->mm->mmap_sem); - return 0; +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + return X86EMUL_CONTINUE; } -static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +int emulate_clts(struct kvm_vcpu *vcpu) { - return kvm->arch.n_alloc_mmu_pages; + kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); + return X86EMUL_CONTINUE; } -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) { - int i; - struct kvm_mem_alias *alias; + struct kvm_vcpu *vcpu = ctxt->vcpu; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; + switch (dr) { + case 0 ... 3: + *dest = kvm_x86_ops->get_dr(vcpu, dr); + return X86EMUL_CONTINUE; + default: + pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); + return X86EMUL_UNHANDLEABLE; } - return gfn; } -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. 
- */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - int r, n; - struct kvm_mem_alias *p; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - down_write(¤t->mm->mmap_sem); + unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; + int exception; - p = &kvm->arch.aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); + if (exception) { + /* FIXME: better handling */ + return X86EMUL_UNHANDLEABLE; + } + return X86EMUL_CONTINUE; +} - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) - break; - kvm->arch.naliases = n; +void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) +{ + static int reported; + u8 opcodes[4]; + unsigned long rip = vcpu->rip; + unsigned long rip_linear; - kvm_mmu_zap_all(kvm); + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - up_write(¤t->mm->mmap_sem); + if (reported) + return; - return 0; + emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); -out: - return r; + printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + reported = 1; } +EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); -static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +struct x86_emulate_ops emulate_ops = { + .read_std = emulator_read_std, + .write_std = emulator_write_std, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, + struct kvm_run *run, + unsigned long cr2, + u16 error_code) { + struct x86_emulate_ctxt emulate_ctxt; int r; + int cs_db, cs_l; - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - memcpy(&chip->chip.ioapic, - ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); - break; - default: - r = -EINVAL; - break; + vcpu->mmio_fault_cr2 = cr2; + kvm_x86_ops->cache_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + emulate_ctxt.vcpu = vcpu; + emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + emulate_ctxt.cr2 = cr2; + emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { + emulate_ctxt.cs_base = 0; + emulate_ctxt.ds_base = 0; + emulate_ctxt.es_base = 0; + emulate_ctxt.ss_base = 0; + } else { + emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); + emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); + emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); + emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); } - return r; -} -static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; + emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); + emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - memcpy(ioapic_irqchip(kvm), - &chip->chip.ioapic, - sizeof(struct kvm_ioapic_state)); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic_irqchip(kvm)); - return r; -} + vcpu->mmio_is_write = 0; + vcpu->pio.string = 0; + r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); + if (vcpu->pio.string) + return EMULATE_DO_MMIO; -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) -{ - int r; - int n; - struct kvm_memory_slot *memslot; - int is_dirty = 0; + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } - down_write(¤t->mm->mmap_sem); + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { - kvm_mmu_slot_remove_write_access(kvm, log->slot); - kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; } - r = 0; -out: - up_write(¤t->mm->mmap_sem); - return r; + + return EMULATE_DONE; } +EXPORT_SYMBOL_GPL(emulate_instruction); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +/* + * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
+ */ +static void kvm_vcpu_block(struct kvm_vcpu *vcpu) { - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; - int r = -EINVAL; - - switch (ioctl) { - case KVM_SET_TSS_ADDR: - r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; - break; - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } - case KVM_SET_NR_MMU_PAGES: - r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; - break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; + DECLARE_WAITQUEUE(wait, current); - r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); - if (r) - goto out; - break; - } - case KVM_CREATE_IRQCHIP: - r = -ENOMEM; - kvm->arch.vpic = kvm_create_pic(kvm); - if (kvm->arch.vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - kfree(kvm->arch.vpic); - kvm->arch.vpic = NULL; - goto out; - } - } else - goto out; - break; - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; + add_wait_queue(&vcpu->wq, &wait); - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - if (irqchip_in_kernel(kvm)) { - mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->arch.vioapic, - irq_event.irq, - irq_event.level); - mutex_unlock(&kvm->lock); - r = 0; - } - break; + /* + * We will block until either an interrupt or a signal wakes us up + */ + while (!kvm_cpu_has_interrupt(vcpu) + && !signal_pending(current) + && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE + && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) { + set_current_state(TASK_INTERRUPTIBLE); + vcpu_put(vcpu); + schedule(); + vcpu_load(vcpu); } - case KVM_GET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; - r = 0; - break; - } - case KVM_SET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + __set_current_state(TASK_RUNNING); + remove_wait_queue(&vcpu->wq, &wait); +} - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) - goto out; - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); - if (r) - goto out; - r = 0; - break; +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.halt_exits; + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->mp_state = VCPU_MP_STATE_HALTED; + kvm_vcpu_block(vcpu); + if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE) + return -EINTR; + return 1; + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return 0; } - case KVM_GET_SUPPORTED_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); - r = -EFAULT; - 
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, - cpuid_arg->entries); - if (r) - goto out; +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; + kvm_x86_ops->cache_regs(vcpu); + ret = -KVM_EINVAL; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + nr = vcpu->regs[VCPU_REGS_RAX]; + a0 = vcpu->regs[VCPU_REGS_RDI]; + a1 = vcpu->regs[VCPU_REGS_RSI]; + a2 = vcpu->regs[VCPU_REGS_RDX]; + a3 = vcpu->regs[VCPU_REGS_RCX]; + a4 = vcpu->regs[VCPU_REGS_R8]; + a5 = vcpu->regs[VCPU_REGS_R9]; + } else +#endif + { + nr = vcpu->regs[VCPU_REGS_RBX] & -1u; + a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; + a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; + a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; + a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; + a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; + a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; } + switch (nr) { default: - ; + run->hypercall.nr = nr; + run->hypercall.args[0] = a0; + run->hypercall.args[1] = a1; + run->hypercall.args[2] = a2; + run->hypercall.args[3] = a3; + run->hypercall.args[4] = a4; + run->hypercall.args[5] = a5; + run->hypercall.ret = ret; + run->hypercall.longmode = is_long_mode(vcpu); + kvm_x86_ops->decache_regs(vcpu); + return 0; } -out: - return r; + vcpu->regs[VCPU_REGS_RAX] = ret; + kvm_x86_ops->decache_regs(vcpu); + return 1; } +EXPORT_SYMBOL_GPL(kvm_hypercall); -static void kvm_init_msr_list(void) +static u64 mk_cr_64(u64 curr_cr, u32 new_val) { - u32 dummy[2]; - unsigned i, j; - - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) - continue; - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; - } - num_msrs_to_save = j; + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -/* - * Only apic need an MMIO device hook, so shortcut now.. 
- */ -static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr) +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_io_device *dev; + struct descriptor_table dt = { limit, base }; - if (vcpu->arch.apic) { - dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr)) - return dev; - } - return NULL; + kvm_x86_ops->set_gdt(vcpu, &dt); } - -static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct kvm_io_device *dev; + struct descriptor_table dt = { limit, base }; - dev = vcpu_find_pervcpu_dev(vcpu, addr); - if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); - return dev; + kvm_x86_ops->set_idt(vcpu, &dt); } -int emulator_read_std(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) { - void *data = val; - int r = X86EMUL_CONTINUE; - - down_read(¤t->mm->mmap_sem); - while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - unsigned offset = addr & (PAGE_SIZE-1); - unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); - int ret; + lmsw(vcpu, msw); + *rflags = kvm_x86_ops->get_rflags(vcpu); +} - if (gpa == UNMAPPED_GVA) { - r = X86EMUL_PROPAGATE_FAULT; - goto out; - } - ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); - if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; - goto out; - } +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + switch (cr) { + case 0: + return vcpu->cr0; + case 2: + return vcpu->cr2; + case 3: + return vcpu->cr3; + case 4: + return vcpu->cr4; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + return 0; + } +} - bytes -= tocopy; - data += tocopy; - addr += tocopy; +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); + *rflags = kvm_x86_ops->get_rflags(vcpu); + break; + case 2: + vcpu->cr2 = val; + break; + case 3: + set_cr3(vcpu, val); + break; + case 4: + set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); } -out: - up_read(¤t->mm->mmap_sem); - return r; } -EXPORT_SYMBOL_GPL(emulator_read_std); -static int emulator_read_emulated(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +/* + * Register the para guest with the host: + */ +static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) { - struct kvm_io_device *mmio_dev; - gpa_t gpa; + struct kvm_vcpu_para_state *para_state; + hpa_t para_state_hpa, hypercall_hpa; + struct page *para_state_page; + unsigned char *hypercall; + gpa_t hypercall_gpa; - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - vcpu->mmio_read_completed = 0; - return X86EMUL_CONTINUE; - } + printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); + printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); + + /* + * Needs to be page aligned: + */ + if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) + goto err_gp; - down_read(¤t->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(¤t->mm->mmap_sem); + para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); + printk(KERN_DEBUG ".... 
para_state_hpa: %08Lx\n", para_state_hpa); + if (is_error_hpa(para_state_hpa)) + goto err_gp; - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; + mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); + para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); + para_state = kmap(para_state_page); - if (emulator_read_std(addr, val, bytes, vcpu) - == X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; + printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); + printk(KERN_DEBUG ".... size: %d\n", para_state->size); -mmio: + para_state->host_version = KVM_PARA_API_VERSION; /* - * Is this MMIO handled locally? + * We cannot support guests that try to register themselves + * with a newer API version than the host supports: */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); - if (mmio_dev) { - kvm_iodevice_read(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); - return X86EMUL_CONTINUE; + if (para_state->guest_version > KVM_PARA_API_VERSION) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; } - mutex_unlock(&vcpu->kvm->lock); - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + hypercall_gpa = para_state->hypercall_gpa; + hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); + printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); + if (is_error_hpa(hypercall_hpa)) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } - return X86EMUL_UNHANDLEABLE; -} + printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); + vcpu->para_state_page = para_state_page; + vcpu->para_state_gpa = para_state_gpa; + vcpu->hypercall_gpa = hypercall_gpa; -static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) -{ - int ret; + mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); + hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), + KM_USER1) + (hypercall_hpa & ~PAGE_MASK); + kvm_x86_ops->patch_hypercall(vcpu, hypercall); + kunmap_atomic(hypercall, KM_USER1); - down_read(¤t->mm->mmap_sem); - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) { - up_read(¤t->mm->mmap_sem); - return 0; - } - kvm_mmu_pte_write(vcpu, gpa, val, bytes); - up_read(¤t->mm->mmap_sem); + para_state->ret = 0; +err_kunmap_skip: + kunmap(para_state_page); + return 0; +err_gp: return 1; } -static int emulator_write_emulated_onepage(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { - struct kvm_io_device *mmio_dev; - gpa_t gpa; - - down_read(¤t->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(¤t->mm->mmap_sem); - - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); - return X86EMUL_PROPAGATE_FAULT; - } - - /* For APIC access vmexit */ - if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto mmio; - - if (emulator_write_phys(vcpu, gpa, val, bytes)) - return X86EMUL_CONTINUE; + u64 data; -mmio: - /* - * Is this MMIO handled locally? 
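/*
 * Illustrative sketch, not part of this patch: the guest-side counterpart
 * of vcpu_register_para() above.  The guest fills a page-aligned
 * kvm_vcpu_para_state, points hypercall_gpa at a page the host may patch,
 * and announces the state page with a write to MSR_KVM_API_MAGIC (handled
 * further down in this patch).  Field names follow the host code; the
 * helper name and the 32-bit GPA arguments are assumptions.
 */
static int example_register_para(struct kvm_vcpu_para_state *ps,
				 u32 ps_gpa, u32 hc_gpa)
{
	ps->guest_version = KVM_PARA_API_VERSION;
	ps->size = sizeof(*ps);
	ps->hypercall_gpa = hc_gpa;

	wrmsr(MSR_KVM_API_MAGIC, ps_gpa, 0);	/* host fills ps->ret */
	return ps->ret;
}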
- */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); - if (mmio_dev) { - kvm_iodevice_write(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); - return X86EMUL_CONTINUE; + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data = 0; + break; + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->ia32_misc_enable_msr; + break; +#ifdef CONFIG_X86_64 + case MSR_EFER: + data = vcpu->shadow_efer; + break; +#endif + default: + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; } - mutex_unlock(&vcpu->kvm->lock); - - vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); - - return X86EMUL_CONTINUE; + *pdata = data; + return 0; } +EXPORT_SYMBOL_GPL(kvm_get_msr_common); -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; - - now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - addr += now; - val += now; - bytes -= now; - } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } -EXPORT_SYMBOL_GPL(emulator_write_emulated); -static int emulator_cmpxchg_emulated(unsigned long addr, - const void *old, - const void *new, - unsigned int bytes, - struct kvm_vcpu *vcpu) +#ifdef CONFIG_X86_64 + +static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { - static int reported; + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + inject_gp(vcpu); + return; + } - if (!reported) { - reported = 1; - printk(KERN_WARNING "kvm: emulating exchange as write\n"); + if (is_paging(vcpu) + && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + inject_gp(vcpu); + return; } -#ifndef CONFIG_X86_64 - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes == 8) { - gpa_t gpa; - struct page *page; - char *addr; - u64 val; - down_read(¤t->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + kvm_x86_ops->set_efer(vcpu, efer); - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; + efer &= ~EFER_LMA; + efer |= vcpu->shadow_efer & EFER_LMA; - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; + vcpu->shadow_efer = efer; +} - val = *(u64 *)new; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - addr = kmap_atomic(page, KM_USER0); - set_64bit((u64 *)(addr + offset_in_page(gpa)), val); - 
kunmap_atomic(addr, KM_USER0); - kvm_release_page_dirty(page); - emul_write: - up_read(¤t->mm->mmap_sem); - } #endif - return emulator_write_emulated(addr, new, bytes, vcpu); -} - -static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - return kvm_x86_ops->get_segment_base(vcpu, seg); + switch (msr) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_MCG_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case MSR_IA32_MISC_ENABLE: + vcpu->ia32_misc_enable_msr = data; + break; + /* + * This is the 'probe whether the host is KVM' logic: + */ + case MSR_KVM_API_MAGIC: + return vcpu_register_para(vcpu, data); + + default: + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); + return 1; + } + return 0; } +EXPORT_SYMBOL_GPL(kvm_set_msr_common); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { - return X86EMUL_CONTINUE; + return kvm_x86_ops->set_msr(vcpu, msr_index, data); } -int emulate_clts(struct kvm_vcpu *vcpu) +void kvm_resched(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); - return X86EMUL_CONTINUE; + if (!need_resched()) + return; + cond_resched(); } +EXPORT_SYMBOL_GPL(kvm_resched); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = ctxt->vcpu; + int i; + u32 function; + struct kvm_cpuid_entry *e, *best; - switch (dr) { - case 0 ... 3: - *dest = kvm_x86_ops->get_dr(vcpu, dr); - return X86EMUL_CONTINUE; - default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); - return X86EMUL_UNHANDLEABLE; - } -} - -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) -{ - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U; - int exception; - - kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); - if (exception) { - /* FIXME: better handling */ - return X86EMUL_UNHANDLEABLE; - } - return X86EMUL_CONTINUE; -} - -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - static int reported; - u8 opcodes[4]; - unsigned long rip = vcpu->arch.rip; - unsigned long rip_linear; - - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - if (reported) - return; - - emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); - reported = 1; -} -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); - -struct x86_emulate_ops emulate_ops = { - .read_std = emulator_read_std, - .read_emulated = emulator_read_emulated, - .write_emulated = emulator_write_emulated, - .cmpxchg_emulated = emulator_cmpxchg_emulated, -}; - -int emulate_instruction(struct kvm_vcpu *vcpu, - struct kvm_run *run, - unsigned long cr2, - u16 error_code, - int emulation_type) -{ - int r; - struct decode_cache *c; - - vcpu->arch.mmio_fault_cr2 = cr2; kvm_x86_ops->cache_regs(vcpu); - - vcpu->mmio_is_write = 0; - vcpu->arch.pio.string = 0; - - if (!(emulation_type & EMULTYPE_NO_DECODE)) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.mode = - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - - if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { - vcpu->arch.emulate_ctxt.cs_base = 0; - vcpu->arch.emulate_ctxt.ds_base = 0; - vcpu->arch.emulate_ctxt.es_base = 0; - vcpu->arch.emulate_ctxt.ss_base = 0; - } else { - vcpu->arch.emulate_ctxt.cs_base = - get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.emulate_ctxt.ds_base = - get_segment_base(vcpu, VCPU_SREG_DS); - vcpu->arch.emulate_ctxt.es_base = - get_segment_base(vcpu, VCPU_SREG_ES); - vcpu->arch.emulate_ctxt.ss_base = - get_segment_base(vcpu, VCPU_SREG_SS); - } - - vcpu->arch.emulate_ctxt.gs_base = - get_segment_base(vcpu, VCPU_SREG_GS); - vcpu->arch.emulate_ctxt.fs_base = - get_segment_base(vcpu, VCPU_SREG_FS); - - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - - /* Reject the instructions other than VMCALL/VMMCALL when - * try to emulate invalid opcode */ - c = &vcpu->arch.emulate_ctxt.decode; - if ((emulation_type & EMULTYPE_TRAP_UD) && - (!(c->twobyte && c->b == 0x01 && - (c->modrm_reg == 0 || c->modrm_reg == 3) && - c->modrm_mod == 3 && c->modrm_rm == 1))) - return EMULATE_FAIL; - - ++vcpu->stat.insn_emulation; - if (r) { - ++vcpu->stat.insn_emulation_fail; - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; - return EMULATE_FAIL; + function = vcpu->regs[VCPU_REGS_RAX]; + vcpu->regs[VCPU_REGS_RAX] = 0; + vcpu->regs[VCPU_REGS_RBX] = 0; + vcpu->regs[VCPU_REGS_RCX] = 0; + vcpu->regs[VCPU_REGS_RDX] = 0; + best = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == function) { + best = e; + break; } + /* + * Both basic or both extended? 
+ */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; } - - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - - if (vcpu->arch.pio.string) - return EMULATE_DO_MMIO; - - if ((r || vcpu->mmio_is_write) && run) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; - if (!vcpu->mmio_needed) { - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } - return EMULATE_DO_MMIO; + if (best) { + vcpu->regs[VCPU_REGS_RAX] = best->eax; + vcpu->regs[VCPU_REGS_RBX] = best->ebx; + vcpu->regs[VCPU_REGS_RCX] = best->ecx; + vcpu->regs[VCPU_REGS_RDX] = best->edx; } - kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } - - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(emulate_instruction); - -static void free_pio_guest_pages(struct kvm_vcpu *vcpu) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) - if (vcpu->arch.pio.guest_pages[i]) { - kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); - vcpu->arch.pio.guest_pages[i] = NULL; - } + kvm_x86_ops->skip_emulated_instruction(vcpu); } +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); static int pio_copy_data(struct kvm_vcpu *vcpu) { - void *p = vcpu->arch.pio_data; + void *p = vcpu->pio_data; void *q; unsigned bytes; - int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; - q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, + q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, PAGE_KERNEL); if (!q) { free_pio_guest_pages(vcpu); return -ENOMEM; } - q += vcpu->arch.pio.guest_page_offset; - bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; - if (vcpu->arch.pio.in) + q += vcpu->pio.guest_page_offset; + bytes = vcpu->pio.size * vcpu->pio.cur_count; + if (vcpu->pio.in) memcpy(q, p, bytes); else memcpy(p, q, bytes); - q -= vcpu->arch.pio.guest_page_offset; + q -= vcpu->pio.guest_page_offset; vunmap(q); free_pio_guest_pages(vcpu); return 0; } -int complete_pio(struct kvm_vcpu *vcpu) +static int complete_pio(struct kvm_vcpu *vcpu) { - struct kvm_pio_request *io = &vcpu->arch.pio; + struct kvm_pio_request *io = &vcpu->pio; long delta; int r; @@ -1985,7 +1760,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (!io->string) { if (io->in) - memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, + memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, io->size); } else { if (io->in) { @@ -2003,15 +1778,15 @@ int complete_pio(struct kvm_vcpu *vcpu) * The size of the register should really depend on * current address size. 
*/ - vcpu->arch.regs[VCPU_REGS_RCX] -= delta; + vcpu->regs[VCPU_REGS_RCX] -= delta; } if (io->down) delta = -delta; delta *= io->size; if (io->in) - vcpu->arch.regs[VCPU_REGS_RDI] += delta; + vcpu->regs[VCPU_REGS_RDI] += delta; else - vcpu->arch.regs[VCPU_REGS_RSI] += delta; + vcpu->regs[VCPU_REGS_RSI] += delta; } kvm_x86_ops->decache_regs(vcpu); @@ -2029,13 +1804,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev, /* TODO: String I/O for in kernel device */ mutex_lock(&vcpu->kvm->lock); - if (vcpu->arch.pio.in) - kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, + if (vcpu->pio.in) + kvm_iodevice_read(pio_dev, vcpu->pio.port, + vcpu->pio.size, pd); else - kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, + kvm_iodevice_write(pio_dev, vcpu->pio.port, + vcpu->pio.size, pd); mutex_unlock(&vcpu->kvm->lock); } @@ -2043,8 +1818,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev, static void pio_string_write(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) { - struct kvm_pio_request *io = &vcpu->arch.pio; - void *pd = vcpu->arch.pio_data; + struct kvm_pio_request *io = &vcpu->pio; + void *pd = vcpu->pio_data; int i; mutex_lock(&vcpu->kvm->lock); @@ -2057,38 +1832,32 @@ static void pio_string_write(struct kvm_io_device *pio_dev, mutex_unlock(&vcpu->kvm->lock); } -static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) -{ - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); -} - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.size = vcpu->pio.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 0; - vcpu->arch.pio.down = 0; - vcpu->arch.pio.guest_page_offset = 0; - vcpu->arch.pio.rep = 0; + vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; + vcpu->run->io.port = vcpu->pio.port = port; + vcpu->pio.in = in; + vcpu->pio.string = 0; + vcpu->pio.down = 0; + vcpu->pio.guest_page_offset = 0; + vcpu->pio.rep = 0; kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); + memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); pio_dev = vcpu_find_pio_dev(vcpu, port); if (pio_dev) { - kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); + kernel_pio(pio_dev, vcpu, vcpu->pio_data); complete_pio(vcpu); return 1; } @@ -2108,15 +1877,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.size = vcpu->pio.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 1; - vcpu->arch.pio.down = down; - vcpu->arch.pio.guest_page_offset = offset_in_page(address); - vcpu->arch.pio.rep = rep; + vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; + vcpu->run->io.port = vcpu->pio.port = port; + vcpu->pio.in = in; + vcpu->pio.string = 1; + vcpu->pio.down = down; + vcpu->pio.guest_page_offset = offset_in_page(address); + vcpu->pio.rep = rep; if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -2142,35 +1911,37 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, * String I/O in reverse. Yuck. Kill the guest, fix later. */ pr_unimpl(vcpu, "guest string pio down\n"); - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); return 1; } vcpu->run->io.count = now; - vcpu->arch.pio.cur_count = now; + vcpu->pio.cur_count = now; - if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) + if (vcpu->pio.cur_count == vcpu->pio.count) kvm_x86_ops->skip_emulated_instruction(vcpu); for (i = 0; i < nr_pages; ++i) { - down_read(¤t->mm->mmap_sem); + mutex_lock(&vcpu->kvm->lock); page = gva_to_page(vcpu, address + i * PAGE_SIZE); - vcpu->arch.pio.guest_pages[i] = page; - up_read(¤t->mm->mmap_sem); + if (page) + get_page(page); + vcpu->pio.guest_pages[i] = page; + mutex_unlock(&vcpu->kvm->lock); if (!page) { - kvm_inject_gp(vcpu, 0); + inject_gp(vcpu); free_pio_guest_pages(vcpu); return 1; } } pio_dev = vcpu_find_pio_dev(vcpu, port); - if (!vcpu->arch.pio.in) { + if (!vcpu->pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); if (ret >= 0 && pio_dev) { pio_string_write(pio_dev, vcpu); complete_pio(vcpu); - if (vcpu->arch.pio.count == 0) + if (vcpu->pio.count == 0) ret = 1; } } else if (pio_dev) @@ -2182,427 +1953,112 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); -int kvm_arch_init(void *opaque) +/* + * Check if userspace requested an interrupt window, and that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. 
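/*
 * Illustrative sketch, not part of this patch: how a launcher without an
 * in-kernel irqchip typically drives the interrupt-window handshake that
 * dm_request_for_irq_injection()/post_kvm_run_save() below implement.
 * Userspace code, assumes <sys/ioctl.h> and <linux/kvm.h>; vcpu_fd and
 * the caller's pending-vector bookkeeping are assumptions.
 */
static void example_inject_irq(int vcpu_fd, struct kvm_run *run, __u32 vector)
{
	struct kvm_interrupt intr = { .irq = vector };

	if (!run->ready_for_interrupt_injection || !run->if_flag) {
		/* ask __vcpu_run() to exit as soon as injection is possible */
		run->request_interrupt_window = 1;
		return;
	}
	ioctl(vcpu_fd, KVM_INTERRUPT, &intr);
}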
+ */ +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + return (!vcpu->irq_summary && + kvm_run->request_interrupt_window && + vcpu->interrupt_window_open && + (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); +} + +static void post_kvm_run_save(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; + else + kvm_run->ready_for_interrupt_injection = + (vcpu->interrupt_window_open && + vcpu->irq_summary == 0); +} + +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; - if (kvm_x86_ops) { - printk(KERN_ERR "kvm: already loaded the other module\n"); - r = -EEXIST; - goto out; + if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { + printk("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->sipi_vector); + kvm_lapic_reset(vcpu); + kvm_x86_ops->vcpu_reset(vcpu); + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; } - if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); - r = -EOPNOTSUPP; - goto out; - } - if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); - r = -EOPNOTSUPP; - goto out; - } +preempted: + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); - r = kvm_mmu_module_init(); - if (r) +again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) goto out; - kvm_init_msr_list(); - - kvm_x86_ops = ops; - kvm_mmu_set_nonpresent_ptes(0ull, 0ull); - return 0; + preempt_disable(); -out: - return r; -} + kvm_x86_ops->prepare_guest_switch(vcpu); + kvm_load_guest_fpu(vcpu); -void kvm_arch_exit(void) -{ - kvm_x86_ops = NULL; - kvm_mmu_module_exit(); -} + local_irq_disable(); -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; - kvm_vcpu_block(vcpu); - if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) - return -EINTR; - return 1; - } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; - return 0; + if (signal_pending(current)) { + local_irq_enable(); + preempt_enable(); + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + goto out; } -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - kvm_x86_ops->cache_regs(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_x86_ops->inject_pending_irq(vcpu); + else if (!vcpu->mmio_read_completed) + kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); - nr = vcpu->arch.regs[VCPU_REGS_RAX]; - a0 = vcpu->arch.regs[VCPU_REGS_RBX]; - a1 = vcpu->arch.regs[VCPU_REGS_RCX]; - a2 = vcpu->arch.regs[VCPU_REGS_RDX]; - a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + vcpu->guest_mode = 1; + kvm_guest_enter(); - if (!is_long_mode(vcpu)) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } + if (vcpu->requests) + if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - default: - ret = -KVM_ENOSYS; - break; - } - vcpu->arch.regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); + kvm_x86_ops->run(vcpu, kvm_run); -int 
kvm_fix_hypercall(struct kvm_vcpu *vcpu) -{ - char instruction[3]; - int ret = 0; + vcpu->guest_mode = 0; + local_irq_enable(); + ++vcpu->stat.exits; /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. + * We must have an instruction between local_irq_enable() and + * kvm_guest_exit(), so the timer interrupt isn't delayed by + * the interrupt shadow. The stat.exits increment will do nicely. + * But we need to prevent reordering, hence this barrier(): */ - kvm_mmu_zap_all(vcpu->kvm); - - kvm_x86_ops->cache_regs(vcpu); - kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) - != X86EMUL_CONTINUE) - ret = -EFAULT; + barrier(); - return ret; -} + kvm_guest_exit(); -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} + preempt_enable(); -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + kvm_x86_ops->cache_regs(vcpu); + profile_hit(KVM_PROFILING, (void *)vcpu->rip); + } - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct descriptor_table dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); -} - -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags) -{ - lmsw(vcpu, msw); - *rflags = kvm_x86_ops->get_rflags(vcpu); -} - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) -{ - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - switch (cr) { - case 0: - return vcpu->arch.cr0; - case 2: - return vcpu->arch.cr2; - case 3: - return vcpu->arch.cr3; - case 4: - return vcpu->arch.cr4; - case 8: - return get_cr8(vcpu); - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); - return 0; - } -} - -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, - unsigned long *rflags) -{ - switch (cr) { - case 0: - set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); - *rflags = kvm_x86_ops->get_rflags(vcpu); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - set_cr3(vcpu, val); - break; - case 4: - set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); - break; - case 8: - set_cr8(vcpu, val & 0xfUL); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); - } -} - -static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; j == i; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ -} - -/* find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - u32 function, u32 index) -{ - if (e->function != function) - return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & 
KVM_CPUID_FLAG_STATE_READ_NEXT)) - return 0; - return 1; -} - -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - int i; - u32 function, index; - struct kvm_cpuid_entry2 *e, *best; - - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->arch.regs[VCPU_REGS_RAX]; - index = vcpu->arch.regs[VCPU_REGS_RCX]; - vcpu->arch.regs[VCPU_REGS_RAX] = 0; - vcpu->arch.regs[VCPU_REGS_RBX] = 0; - vcpu->arch.regs[VCPU_REGS_RCX] = 0; - vcpu->arch.regs[VCPU_REGS_RDX] = 0; - best = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; - break; - } - /* - * Both basic or both extended? - */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; - } - if (best) { - vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; - vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; - vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; - } - kvm_x86_ops->decache_regs(vcpu); - kvm_x86_ops->skip_emulated_instruction(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); - -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - return (!vcpu->arch.irq_summary && - kvm_run->request_interrupt_window && - vcpu->arch.interrupt_window_open && - (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); -} - -static void post_kvm_run_save(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) -{ - kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary == 0); -} - -static void vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct page *page; - - if (!apic || !apic->vapic_addr) - return; - - down_read(¤t->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - vcpu->arch.apic->vapic_page = page; - up_read(¤t->mm->mmap_sem); -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic || !apic->vapic_addr) - return; - - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); -} - -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - int r; - - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - } - - vapic_enter(vcpu); - -preempted: - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - -again: - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) - __kvm_migrate_apic_timer(vcpu); - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { - kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; - r = 0; - goto out; - } - } - - 
kvm_inject_pending_timer_irqs(vcpu); - - preempt_disable(); - - kvm_x86_ops->prepare_guest_switch(vcpu); - kvm_load_guest_fpu(vcpu); - - local_irq_disable(); - - if (need_resched()) { - local_irq_enable(); - preempt_enable(); - r = 1; - goto out; - } - - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - - if (vcpu->arch.exception.pending) - __queue_exception(vcpu); - else if (irqchip_in_kernel(vcpu->kvm)) - kvm_x86_ops->inject_pending_irq(vcpu); - else - kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); - - kvm_lapic_sync_to_vapic(vcpu); - - vcpu->guest_mode = 1; - kvm_guest_enter(); - - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); - - kvm_x86_ops->run(vcpu, kvm_run); - - vcpu->guest_mode = 0; - local_irq_enable(); - - ++vcpu->stat.exits; - - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. - * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); - - preempt_enable(); - - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); - } - - if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) - vcpu->arch.exception.pending = false; - - kvm_lapic_sync_from_vapic(vcpu); - - r = kvm_x86_ops->handle_exit(kvm_run, vcpu); + r = kvm_x86_ops->handle_exit(kvm_run, vcpu); if (r > 0) { if (dm_request_for_irq_injection(vcpu, kvm_run)) { @@ -2611,8 +2067,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ++vcpu->stat.request_irq_exits; goto out; } - if (!need_resched()) + if (!need_resched()) { + ++vcpu->stat.light_exits; goto again; + } } out: @@ -2623,19 +2081,18 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) post_kvm_run_save(vcpu, kvm_run); - vapic_exit(vcpu); - return r; } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) + +static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; sigset_t sigsaved; vcpu_load(vcpu); - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { + if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); vcpu_put(vcpu); return -EAGAIN; @@ -2648,19 +2105,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.cur_count) { + if (vcpu->pio.cur_count) { r = complete_pio(vcpu); if (r) goto out; } -#if CONFIG_HAS_IOMEM + if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; r = emulate_instruction(vcpu, kvm_run, - vcpu->arch.mmio_fault_cr2, 0, - EMULTYPE_NO_DECODE); + vcpu->mmio_fault_cr2, 0); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. 
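/*
 * Illustrative sketch, not part of this patch: the userspace half of the
 * KVM_EXIT_MMIO round trip whose completion is handled above (the
 * mmio_data copy before re-entering the emulator).  Userspace code; the
 * handle_mmio_read()/handle_mmio_write() device-model hooks and vcpu_fd
 * are assumptions.
 */
extern void handle_mmio_read(__u64 addr, void *data, __u32 len);
extern void handle_mmio_write(__u64 addr, const void *data, __u32 len);

static void example_handle_mmio_exit(int vcpu_fd, struct kvm_run *run)
{
	if (run->mmio.is_write)
		handle_mmio_write(run->mmio.phys_addr,
				  run->mmio.data, run->mmio.len);
	else
		/* result is picked up from kvm_run->mmio.data on re-entry */
		handle_mmio_read(run->mmio.phys_addr,
				 run->mmio.data, run->mmio.len);

	ioctl(vcpu_fd, KVM_RUN, 0);	/* resume; kernel finishes the insn */
}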
@@ -2669,10 +2125,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } } -#endif + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { kvm_x86_ops->cache_regs(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; + vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; kvm_x86_ops->decache_regs(vcpu); } @@ -2686,32 +2142,33 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return r; } -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) { vcpu_load(vcpu); kvm_x86_ops->cache_regs(vcpu); - regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; + regs->rax = vcpu->regs[VCPU_REGS_RAX]; + regs->rbx = vcpu->regs[VCPU_REGS_RBX]; + regs->rcx = vcpu->regs[VCPU_REGS_RCX]; + regs->rdx = vcpu->regs[VCPU_REGS_RDX]; + regs->rsi = vcpu->regs[VCPU_REGS_RSI]; + regs->rdi = vcpu->regs[VCPU_REGS_RDI]; + regs->rsp = vcpu->regs[VCPU_REGS_RSP]; + regs->rbp = vcpu->regs[VCPU_REGS_RBP]; #ifdef CONFIG_X86_64 - regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; - regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; - regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; - regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; - regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; - regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; - regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; - regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; + regs->r8 = vcpu->regs[VCPU_REGS_R8]; + regs->r9 = vcpu->regs[VCPU_REGS_R9]; + regs->r10 = vcpu->regs[VCPU_REGS_R10]; + regs->r11 = vcpu->regs[VCPU_REGS_R11]; + regs->r12 = vcpu->regs[VCPU_REGS_R12]; + regs->r13 = vcpu->regs[VCPU_REGS_R13]; + regs->r14 = vcpu->regs[VCPU_REGS_R14]; + regs->r15 = vcpu->regs[VCPU_REGS_R15]; #endif - regs->rip = vcpu->arch.rip; + regs->rip = vcpu->rip; regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* @@ -2725,30 +2182,31 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) { vcpu_load(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; - vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; + vcpu->regs[VCPU_REGS_RAX] = regs->rax; + vcpu->regs[VCPU_REGS_RBX] = regs->rbx; + vcpu->regs[VCPU_REGS_RCX] = regs->rcx; + vcpu->regs[VCPU_REGS_RDX] = regs->rdx; + vcpu->regs[VCPU_REGS_RSI] = regs->rsi; + vcpu->regs[VCPU_REGS_RDI] = regs->rdi; + vcpu->regs[VCPU_REGS_RSP] = regs->rsp; + vcpu->regs[VCPU_REGS_RBP] = regs->rbp; #ifdef CONFIG_X86_64 - vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; - vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; - vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; - vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; - vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; - vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; - vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; - 
vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; + vcpu->regs[VCPU_REGS_R8] = regs->r8; + vcpu->regs[VCPU_REGS_R9] = regs->r9; + vcpu->regs[VCPU_REGS_R10] = regs->r10; + vcpu->regs[VCPU_REGS_R11] = regs->r11; + vcpu->regs[VCPU_REGS_R12] = regs->r12; + vcpu->regs[VCPU_REGS_R13] = regs->r13; + vcpu->regs[VCPU_REGS_R14] = regs->r14; + vcpu->regs[VCPU_REGS_R15] = regs->r15; #endif - vcpu->arch.rip = regs->rip; + vcpu->rip = regs->rip; kvm_x86_ops->set_rflags(vcpu, regs->rflags); kvm_x86_ops->decache_regs(vcpu); @@ -2764,18 +2222,8 @@ static void get_segment(struct kvm_vcpu *vcpu, return kvm_x86_ops->get_segment(vcpu, var, seg); } -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { struct descriptor_table dt; int pending_vec; @@ -2800,12 +2248,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->gdt.base = dt.base; kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; - sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr0 = vcpu->cr0; + sregs->cr2 = vcpu->cr2; + sregs->cr3 = vcpu->cr3; + sregs->cr4 = vcpu->cr4; sregs->cr8 = get_cr8(vcpu); - sregs->efer = vcpu->arch.shadow_efer; + sregs->efer = vcpu->shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) { @@ -2813,10 +2261,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sizeof sregs->interrupt_bitmap); pending_vec = kvm_x86_ops->get_irq(vcpu); if (pending_vec >= 0) - set_bit(pending_vec, - (unsigned long *)sregs->interrupt_bitmap); + set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); } else - memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, + memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, sizeof sregs->interrupt_bitmap); vcpu_put(vcpu); @@ -2830,8 +2277,8 @@ static void set_segment(struct kvm_vcpu *vcpu, return kvm_x86_ops->set_segment(vcpu, var, seg); } -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) +static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { int mmu_reset_needed = 0; int i, pending_vec, max_bits; @@ -2846,13 +2293,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, dt.base = sregs->gdt.base; kvm_x86_ops->set_gdt(vcpu, &dt); - vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; + vcpu->cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->cr3 != sregs->cr3; + vcpu->cr3 = sregs->cr3; set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; + mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; #ifdef CONFIG_X86_64 kvm_x86_ops->set_efer(vcpu, sregs->efer); #endif @@ -2860,25 +2307,25 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->decache_cr4_guest_bits(vcpu); - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; - vcpu->arch.cr0 = sregs->cr0; + mmu_reset_needed |= vcpu->cr0 != sregs->cr0; + vcpu->cr0 = sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= vcpu->cr4 != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) - load_pdptrs(vcpu, vcpu->arch.cr3); 
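/*
 * Illustrative sketch, not part of this patch: reading the kvm_sregs
 * structure that the get/set ioctls above and below exchange, e.g. to
 * inspect the control registers of a stopped vcpu from the launcher.
 * Userspace code; vcpu_fd and the use of printf() are assumptions.
 */
static void example_dump_control_regs(int vcpu_fd)
{
	struct kvm_sregs sregs;

	if (ioctl(vcpu_fd, KVM_GET_SREGS, &sregs) < 0)
		return;
	printf("cr0=%llx cr3=%llx cr4=%llx efer=%llx\n",
	       (unsigned long long)sregs.cr0,
	       (unsigned long long)sregs.cr3,
	       (unsigned long long)sregs.cr4,
	       (unsigned long long)sregs.efer);
}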
+ load_pdptrs(vcpu, vcpu->cr3); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); if (!irqchip_in_kernel(vcpu->kvm)) { - memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, - sizeof vcpu->arch.irq_pending); - vcpu->arch.irq_summary = 0; - for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) - if (vcpu->arch.irq_pending[i]) - __set_bit(i, &vcpu->arch.irq_summary); + memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, + sizeof vcpu->irq_pending); + vcpu->irq_summary = 0; + for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) + if (vcpu->irq_pending[i]) + __set_bit(i, &vcpu->irq_summary); } else { max_bits = (sizeof sregs->interrupt_bitmap) << 3; pending_vec = find_first_bit( @@ -2887,8 +2334,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Only pending external irq is handled here */ if (pending_vec < max_bits) { kvm_x86_ops->set_irq(vcpu, pending_vec); - pr_debug("Set back pending irq %d\n", - pending_vec); + printk("Set back pending irq %d\n", pending_vec); } } @@ -2902,386 +2348,1281 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - vcpu_put(vcpu); + vcpu_put(vcpu); + + return 0; +} + +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} +EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. + */ +static u32 msrs_to_save[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TIME_STAMP_COUNTER, +}; + +static unsigned num_msrs_to_save; + +static u32 emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +static __init void kvm_init_msr_list(void) +{ + u32 dummy[2]; + unsigned i, j; + + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; + } + num_msrs_to_save = j; +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return kvm_set_msr(vcpu, index, *data); +} + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + int i; + + vcpu_load(vcpu); + + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + + vcpu_put(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. 
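/*
 * Illustrative sketch, not part of this patch: a userspace caller of the
 * KVM_GET_MSRS path that msr_io() below services, reading one MSR.  The
 * header-plus-entries layout of struct kvm_msrs mirrors what msr_io()
 * copies in; vcpu_fd and the helper name are assumptions.
 */
static __u64 example_read_one_msr(int vcpu_fd, __u32 index)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;
	} req;

	memset(&req, 0, sizeof(req));
	req.hdr.nmsrs = 1;
	req.entry.index = index;

	/* the ioctl returns the number of MSRs processed, 1 on success */
	if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) != 1)
		return 0;
	return req.entry.data;
}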
+ */ +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + goto out; + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = vmalloc(size); + if (!entries) + goto out; + + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; + + r = n = __msr_io(vcpu, &msrs, entries, do_msr); + if (r < 0) + goto out_free; + + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; + + r = n; + +out_free: + vfree(entries); +out: + return r; +} + +/* + * Translate a guest virtual address to a guest physical address. + */ +static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + unsigned long vaddr = tr->linear_address; + gpa_t gpa; + + vcpu_load(vcpu); + mutex_lock(&vcpu->kvm->lock); + gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); + tr->physical_address = gpa; + tr->valid = gpa != UNMAPPED_GVA; + tr->writeable = 1; + tr->usermode = 0; + mutex_unlock(&vcpu->kvm->lock); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; + vcpu_load(vcpu); + + set_bit(irq->irq, vcpu->irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg) +{ + int r; + + vcpu_load(vcpu); + + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + + vcpu_put(vcpu); + + return r; +} + +static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm_vcpu *vcpu = vma->vm_file->private_data; + unsigned long pgoff; + struct page *page; + + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (pgoff == 0) + page = virt_to_page(vcpu->run); + else if (pgoff == KVM_PIO_PAGE_OFFSET) + page = virt_to_page(vcpu->pio_data); + else + return NOPAGE_SIGBUS; + get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + + return page; +} + +static struct vm_operations_struct kvm_vcpu_vm_ops = { + .nopage = kvm_vcpu_nopage, +}; + +static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vcpu_vm_ops; + return 0; +} + +static int kvm_vcpu_release(struct inode *inode, struct file *filp) +{ + struct kvm_vcpu *vcpu = filp->private_data; + + fput(vcpu->kvm->filp); + return 0; +} + +static struct file_operations kvm_vcpu_fops = { + .release = kvm_vcpu_release, + .unlocked_ioctl = kvm_vcpu_ioctl, + .compat_ioctl = kvm_vcpu_ioctl, + .mmap = kvm_vcpu_mmap, +}; + +/* + * Allocates an inode for the vcpu. + */ +static int create_vcpu_fd(struct kvm_vcpu *vcpu) +{ + int fd, r; + struct inode *inode; + struct file *file; + + r = anon_inode_getfd(&fd, &inode, &file, + "kvm-vcpu", &kvm_vcpu_fops, vcpu); + if (r) + return r; + atomic_inc(&vcpu->kvm->filp->f_count); + return fd; +} + +/* + * Creates some virtual cpus. Good luck creating more than one. 
+ */ +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) +{ + int r; + struct kvm_vcpu *vcpu; + + if (!valid_vcpu(n)) + return -EINVAL; + + vcpu = kvm_x86_ops->vcpu_create(kvm, n); + if (IS_ERR(vcpu)) + return PTR_ERR(vcpu); + + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + + /* We do fxsave: this must be aligned. */ + BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); + + vcpu_load(vcpu); + r = kvm_mmu_setup(vcpu); + vcpu_put(vcpu); + if (r < 0) + goto free_vcpu; + + mutex_lock(&kvm->lock); + if (kvm->vcpus[n]) { + r = -EEXIST; + mutex_unlock(&kvm->lock); + goto mmu_unload; + } + kvm->vcpus[n] = vcpu; + mutex_unlock(&kvm->lock); + + /* Now it's all set up, let userspace reach it */ + r = create_vcpu_fd(vcpu); + if (r < 0) + goto unlink; + return r; + +unlink: + mutex_lock(&kvm->lock); + kvm->vcpus[n] = NULL; + mutex_unlock(&kvm->lock); + +mmu_unload: + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); + +free_vcpu: + kvm_x86_ops->vcpu_free(vcpu); + return r; +} + +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + u64 efer; + int i; + struct kvm_cpuid_entry *e, *entry; + + rdmsrl(MSR_EFER, efer); + entry = NULL; + for (i = 0; i < vcpu->cpuid_nent; ++i) { + e = &vcpu->cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} + +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out; + vcpu->cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + return 0; + +out: + return r; +} + +static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) +{ + if (sigset) { + sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + vcpu->sigset_active = 1; + vcpu->sigset = *sigset; + } else + vcpu->sigset_active = 0; + return 0; +} + +/* + * fxsave fpu state. Taken from x86_64/processor.h. 
To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif +}; + +static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(s->regs, vcpu->apic->regs, sizeof *s); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(vcpu->apic->regs, s->regs, sizeof *s); + kvm_apic_post_state_restore(vcpu); + vcpu_put(vcpu); + + return 0; +} + +static long kvm_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r = -EINVAL; + + switch (ioctl) { + case KVM_RUN: + r = -EINVAL; + if (arg) + goto out; + r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); + break; + case KVM_GET_REGS: { + struct kvm_regs kvm_regs; + + memset(&kvm_regs, 0, sizeof kvm_regs); + r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) + goto out; + r = 0; + break; + } + case KVM_SET_REGS: { + struct kvm_regs kvm_regs; + + r = -EFAULT; + if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) + goto out; + r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_SREGS: { + struct kvm_sregs kvm_sregs; + + memset(&kvm_sregs, 0, sizeof kvm_sregs); + r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS: { + struct kvm_sregs kvm_sregs; + + r = -EFAULT; + if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) + goto out; + r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); + if (r) + goto out; + r = 0; + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + + r = -EFAULT; + if (copy_from_user(&tr, argp, sizeof tr)) + goto out; + r = kvm_vcpu_ioctl_translate(vcpu, &tr); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tr, sizeof tr)) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof irq)) + goto 
out; + r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_DEBUG_GUEST: { + struct kvm_debug_guest dbg; + + r = -EFAULT; + if (copy_from_user(&dbg, argp, sizeof dbg)) + goto out; + r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_MSRS: + r = msr_io(vcpu, argp, kvm_get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(vcpu, argp, do_set_msr, 0); + break; + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_SIGNAL_MASK: { + struct kvm_signal_mask __user *sigmask_arg = argp; + struct kvm_signal_mask kvm_sigmask; + sigset_t sigset, *p; + + p = NULL; + if (argp) { + r = -EFAULT; + if (copy_from_user(&kvm_sigmask, argp, + sizeof kvm_sigmask)) + goto out; + r = -EINVAL; + if (kvm_sigmask.len != sizeof sigset) + goto out; + r = -EFAULT; + if (copy_from_user(&sigset, sigmask_arg->sigset, + sizeof sigset)) + goto out; + p = &sigset; + } + r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); + break; + } + case KVM_GET_FPU: { + struct kvm_fpu fpu; + + memset(&fpu, 0, sizeof fpu); + r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &fpu, sizeof fpu)) + goto out; + r = 0; + break; + } + case KVM_SET_FPU: { + struct kvm_fpu fpu; + + r = -EFAULT; + if (copy_from_user(&fpu, argp, sizeof fpu)) + goto out; + r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_LAPIC: { + struct kvm_lapic_state lapic; + + memset(&lapic, 0, sizeof lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &lapic, sizeof lapic)) + goto out; + r = 0; + break; + } + case KVM_SET_LAPIC: { + struct kvm_lapic_state lapic; + + r = -EFAULT; + if (copy_from_user(&lapic, argp, sizeof lapic)) + goto out; + r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + if (r) + goto out; + r = 0; + break; + } + default: + ; + } +out: + return r; +} + +static long kvm_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r = -EINVAL; + + switch (ioctl) { + case KVM_CREATE_VCPU: + r = kvm_vm_ioctl_create_vcpu(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) + goto out; + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); + if (r) + goto out; + break; + } + case KVM_GET_DIRTY_LOG: { + struct kvm_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof log)) + goto out; + r = kvm_vm_ioctl_get_dirty_log(kvm, &log); + if (r) + goto out; + break; + } + case KVM_SET_MEMORY_ALIAS: { + struct kvm_memory_alias alias; + + r = -EFAULT; + if (copy_from_user(&alias, argp, sizeof alias)) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + if (r) + goto out; + break; + } + case KVM_CREATE_IRQCHIP: + r = -ENOMEM; + kvm->vpic = kvm_create_pic(kvm); + if (kvm->vpic) { + r = kvm_ioapic_init(kvm); + if (r) { + kfree(kvm->vpic); + kvm->vpic = NULL; + goto out; + } + } + else + goto out; + break; + case KVM_IRQ_LINE: { + struct kvm_irq_level irq_event; + + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; 
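/*
 * Illustrative sketch, not part of this patch: how a launcher drives the
 * KVM_IRQ_LINE case handled above and below, pulsing a guest interrupt
 * line once KVM_CREATE_IRQCHIP has been issued at VM setup.  Userspace
 * code; vm_fd and the helper name are assumptions.
 */
static int example_pulse_irq(int vm_fd, __u32 gsi)
{
	struct kvm_irq_level event = { .irq = gsi, .level = 1 };

	if (ioctl(vm_fd, KVM_IRQ_LINE, &event) < 0)
		return -1;
	event.level = 0;			/* lower the line again */
	return ioctl(vm_fd, KVM_IRQ_LINE, &event);
}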
+ if (irqchip_in_kernel(kvm)) { + mutex_lock(&kvm->lock); + if (irq_event.irq < 16) + kvm_pic_set_irq(pic_irqchip(kvm), + irq_event.irq, + irq_event.level); + kvm_ioapic_set_irq(kvm->vioapic, + irq_event.irq, + irq_event.level); + mutex_unlock(&kvm->lock); + r = 0; + } + break; + } + case KVM_GET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &chip, sizeof chip)) + goto out; + r = 0; + break; + } + case KVM_SET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + if (r) + goto out; + r = 0; + break; + } + default: + ; + } +out: + return r; +} + +static struct page *kvm_vm_nopage(struct vm_area_struct *vma, + unsigned long address, + int *type) +{ + struct kvm *kvm = vma->vm_file->private_data; + unsigned long pgoff; + struct page *page; + + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + page = gfn_to_page(kvm, pgoff); + if (!page) + return NOPAGE_SIGBUS; + get_page(page); + if (type != NULL) + *type = VM_FAULT_MINOR; + + return page; +} + +static struct vm_operations_struct kvm_vm_vm_ops = { + .nopage = kvm_vm_nopage, +}; + +static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &kvm_vm_vm_ops; + return 0; +} + +static struct file_operations kvm_vm_fops = { + .release = kvm_vm_release, + .unlocked_ioctl = kvm_vm_ioctl, + .compat_ioctl = kvm_vm_ioctl, + .mmap = kvm_vm_mmap, +}; + +static int kvm_dev_ioctl_create_vm(void) +{ + int fd, r; + struct inode *inode; + struct file *file; + struct kvm *kvm; + + kvm = kvm_create_vm(); + if (IS_ERR(kvm)) + return PTR_ERR(kvm); + r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); + if (r) { + kvm_destroy_vm(kvm); + return r; + } + + kvm->filp = file; + + return fd; +} + +static long kvm_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r = -EINVAL; + + switch (ioctl) { + case KVM_GET_API_VERSION: + r = -EINVAL; + if (arg) + goto out; + r = KVM_API_VERSION; + break; + case KVM_CREATE_VM: + r = -EINVAL; + if (arg) + goto out; + r = kvm_dev_ioctl_create_vm(); + break; + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = argp; + struct kvm_msr_list msr_list; + unsigned n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < num_msrs_to_save) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + goto out; + if (copy_to_user(user_msr_list->indices + + num_msrs_to_save * sizeof(u32), + &emulated_msrs, + ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + goto out; + r = 0; + break; + } + case KVM_CHECK_EXTENSION: { + int ext = (long)argp; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_HLT: + r = 1; + break; + default: + r = 0; + break; + } + break; + } + case KVM_GET_VCPU_MMAP_SIZE: + r = -EINVAL; + if (arg) + goto 
out; + r = 2 * PAGE_SIZE; + break; + default: + ; + } +out: + return r; +} + +static struct file_operations kvm_chardev_ops = { + .unlocked_ioctl = kvm_dev_ioctl, + .compat_ioctl = kvm_dev_ioctl, +}; + +static struct miscdevice kvm_dev = { + KVM_MINOR, + "kvm", + &kvm_chardev_ops, +}; + +/* + * Make sure that a cpu that is being hot-unplugged does not have any vcpus + * cached on it. + */ +static void decache_vcpus_on_cpu(int cpu) +{ + struct kvm *vm; + struct kvm_vcpu *vcpu; + int i; + + spin_lock(&kvm_lock); + list_for_each_entry(vm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = vm->vcpus[i]; + if (!vcpu) + continue; + /* + * If the vcpu is locked, then it is running on some + * other cpu and therefore it is not cached on the + * cpu in question. + * + * If it's not locked, check the last cpu it executed + * on. + */ + if (mutex_trylock(&vcpu->mutex)) { + if (vcpu->cpu == cpu) { + kvm_x86_ops->vcpu_decache(vcpu); + vcpu->cpu = -1; + } + mutex_unlock(&vcpu->mutex); + } + } + spin_unlock(&kvm_lock); +} + +static void hardware_enable(void *junk) +{ + int cpu = raw_smp_processor_id(); - return 0; + if (cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_set(cpu, cpus_hardware_enabled); + kvm_x86_ops->hardware_enable(NULL); } -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg) +static void hardware_disable(void *junk) { - int r; + int cpu = raw_smp_processor_id(); - vcpu_load(vcpu); + if (!cpu_isset(cpu, cpus_hardware_enabled)) + return; + cpu_clear(cpu, cpus_hardware_enabled); + decache_vcpus_on_cpu(cpu); + kvm_x86_ops->hardware_disable(NULL); +} - r = kvm_x86_ops->set_guest_debug(vcpu, dbg); +static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, + void *v) +{ + int cpu = (long)v; - vcpu_put(vcpu); + switch (val) { + case CPU_DYING: + case CPU_DYING_FROZEN: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + hardware_disable(NULL); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); + smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); + break; + } + return NOTIFY_OK; +} - return r; +static int kvm_reboot(struct notifier_block *notifier, unsigned long val, + void *v) +{ + if (val == SYS_RESTART) { + /* + * Some (well, at least mine) BIOSes hang on reboot if + * in vmx root mode. + */ + printk(KERN_INFO "kvm: exiting hardware virtualization\n"); + on_each_cpu(hardware_disable, NULL, 0, 1); + } + return NOTIFY_OK; } -/* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif +static struct notifier_block kvm_reboot_notifier = { + .notifier_call = kvm_reboot, + .priority = 0, }; -/* - * Translate a guest virtual address to a guest physical address. 
- */ -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) +void kvm_io_bus_init(struct kvm_io_bus *bus) { - unsigned long vaddr = tr->linear_address; - gpa_t gpa; - - vcpu_load(vcpu); - down_read(&current->mm->mmap_sem); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&current->mm->mmap_sem); - tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; - tr->writeable = 1; - tr->usermode = 0; - vcpu_put(vcpu); - - return 0; + memset(bus, 0, sizeof(*bus)); } -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +void kvm_io_bus_destroy(struct kvm_io_bus *bus) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); - - memcpy(fpu->fpr, fxsave->st_space, 128); - fpu->fcw = fxsave->cwd; - fpu->fsw = fxsave->swd; - fpu->ftwx = fxsave->twd; - fpu->last_opcode = fxsave->fop; - fpu->last_ip = fxsave->rip; - fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + int i; - vcpu_put(vcpu); + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; - return 0; + kvm_iodevice_destructor(pos); + } } -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); + int i; - memcpy(fxsave->st_space, fpu->fpr, 128); - fxsave->cwd = fpu->fcw; - fxsave->swd = fpu->fsw; - fxsave->twd = fpu->ftwx; - fxsave->fop = fpu->last_opcode; - fxsave->rip = fpu->last_ip; - fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + for (i = 0; i < bus->dev_count; i++) { + struct kvm_io_device *pos = bus->devs[i]; - vcpu_put(vcpu); + if (pos->in_range(pos, addr)) + return pos; + } - return 0; + return NULL; } -void fx_init(struct kvm_vcpu *vcpu) +void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) { - unsigned after_mxcsr_mask; - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - fx_save(&vcpu->arch.host_fx_image); - fpu_init(); - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); - preempt_enable(); + BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); - vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); + bus->devs[bus->dev_count++] = dev; } -EXPORT_SYMBOL_GPL(fx_init); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +static struct notifier_block kvm_cpu_notifier = { + .notifier_call = kvm_cpu_hotplug, + .priority = 20, /* must be > scheduler priority */ +}; + +static u64 stat_get(void *_offset) { - if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) - return; + unsigned offset = (long)_offset; + u64 total = 0; + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; - vcpu->guest_fpu_loaded = 1; - fx_save(&vcpu->arch.host_fx_image); - fx_restore(&vcpu->arch.guest_fx_image); + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (vcpu) + total += *(u32 *)((void *)vcpu + offset); + } + spin_unlock(&kvm_lock); + return total; } -EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, 
"%llu\n"); + +static __init void kvm_init_debug(void) { - if (!vcpu->guest_fpu_loaded) - return; + struct kvm_stats_debugfs_item *p; - vcpu->guest_fpu_loaded = 0; - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); - ++vcpu->stat.fpu_reload; + debugfs_dir = debugfs_create_dir("kvm", NULL); + for (p = debugfs_entries; p->name; ++p) + p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, + (void *)(long)p->offset, + &stat_fops); } -EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +static void kvm_exit_debug(void) { - kvm_x86_ops->vcpu_free(vcpu); + struct kvm_stats_debugfs_item *p; + + for (p = debugfs_entries; p->name; ++p) + debugfs_remove(p->dentry); + debugfs_remove(debugfs_dir); } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +static int kvm_suspend(struct sys_device *dev, pm_message_t state) { - return kvm_x86_ops->vcpu_create(kvm, id); + hardware_disable(NULL); + return 0; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +static int kvm_resume(struct sys_device *dev) { - int r; - - /* We do fxsave: this must be aligned. */ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - - vcpu_load(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r == 0) - r = kvm_mmu_setup(vcpu); - vcpu_put(vcpu); - if (r < 0) - goto free_vcpu; - + hardware_enable(NULL); return 0; -free_vcpu: - kvm_x86_ops->vcpu_free(vcpu); - return r; } -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); +static struct sysdev_class kvm_sysdev_class = { + .name = "kvm", + .suspend = kvm_suspend, + .resume = kvm_resume, +}; - kvm_x86_ops->vcpu_free(vcpu); -} +static struct sys_device kvm_sysdev = { + .id = 0, + .cls = &kvm_sysdev_class, +}; -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->vcpu_reset(vcpu); -} +hpa_t bad_page_address; -void kvm_arch_hardware_enable(void *garbage) +static inline +struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { - kvm_x86_ops->hardware_enable(garbage); + return container_of(pn, struct kvm_vcpu, preempt_notifier); } -void kvm_arch_hardware_disable(void *garbage) +static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { - kvm_x86_ops->hardware_disable(garbage); -} + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); -int kvm_arch_hardware_setup(void) -{ - return kvm_x86_ops->hardware_setup(); + kvm_x86_ops->vcpu_load(vcpu, cpu); } -void kvm_arch_hardware_unsetup(void) +static void kvm_sched_out(struct preempt_notifier *pn, + struct task_struct *next) { - kvm_x86_ops->hardware_unsetup(); -} + struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); -void kvm_arch_check_processor_compat(void *rtn) -{ - kvm_x86_ops->check_processor_compatibility(rtn); + kvm_x86_ops->vcpu_put(vcpu); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, + struct module *module) { - struct page *page; - struct kvm *kvm; int r; + int cpu; - BUG_ON(vcpu->kvm == NULL); - kvm = vcpu->kvm; - - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; + if (kvm_x86_ops) { + printk(KERN_ERR "kvm: already loaded the other module\n"); + return -EEXIST; + } - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; + if (!ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no 
hardware support\n"); + return -EOPNOTSUPP; + } + if (ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); + return -EOPNOTSUPP; } - vcpu->arch.pio_data = page_address(page); - r = kvm_mmu_create(vcpu); + kvm_x86_ops = ops; + + r = kvm_x86_ops->hardware_setup(); if (r < 0) - goto fail_free_pio_data; + goto out; - if (irqchip_in_kernel(kvm)) { - r = kvm_create_lapic(vcpu); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_x86_ops->check_processor_compatibility, + &r, 0, 1); if (r < 0) - goto fail_mmu_destroy; + goto out_free_0; } - return 0; - -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: - return r; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - kvm_free_lapic(vcpu); - kvm_mmu_destroy(vcpu); - free_page((unsigned long)vcpu->arch.pio_data); -} + on_each_cpu(hardware_enable, NULL, 0, 1); + r = register_cpu_notifier(&kvm_cpu_notifier); + if (r) + goto out_free_1; + register_reboot_notifier(&kvm_reboot_notifier); -struct kvm *kvm_arch_create_vm(void) -{ - struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + r = sysdev_class_register(&kvm_sysdev_class); + if (r) + goto out_free_2; - if (!kvm) - return ERR_PTR(-ENOMEM); + r = sysdev_register(&kvm_sysdev); + if (r) + goto out_free_3; - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + /* A kmem cache lets us meet the alignment requirements of fx_save. */ + kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, + __alignof__(struct kvm_vcpu), 0, 0); + if (!kvm_vcpu_cache) { + r = -ENOMEM; + goto out_free_4; + } - return kvm; -} + kvm_chardev_ops.owner = module; -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} + r = misc_register(&kvm_dev); + if (r) { + printk (KERN_ERR "kvm: misc device register failed\n"); + goto out_free; + } -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; + kvm_preempt_ops.sched_in = kvm_sched_in; + kvm_preempt_ops.sched_out = kvm_sched_out; - /* - * Unpin any mmu pages first. 
- */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm_unload_vcpu_mmu(kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + return r; +out_free: + kmem_cache_destroy(kvm_vcpu_cache); +out_free_4: + sysdev_unregister(&kvm_sysdev); +out_free_3: + sysdev_class_unregister(&kvm_sysdev_class); +out_free_2: + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); +out_free_1: + on_each_cpu(hardware_disable, NULL, 0, 1); +out_free_0: + kvm_x86_ops->hardware_unsetup(); +out: + kvm_x86_ops = NULL; + return r; } -void kvm_arch_destroy_vm(struct kvm *kvm) +void kvm_exit_x86(void) { - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); - kfree(kvm); + misc_deregister(&kvm_dev); + kmem_cache_destroy(kvm_vcpu_cache); + sysdev_unregister(&kvm_sysdev); + sysdev_class_unregister(&kvm_sysdev_class); + unregister_reboot_notifier(&kvm_reboot_notifier); + unregister_cpu_notifier(&kvm_cpu_notifier); + on_each_cpu(hardware_disable, NULL, 0, 1); + kvm_x86_ops->hardware_unsetup(); + kvm_x86_ops = NULL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +static __init int kvm_init(void) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + static struct page *bad_page; + int r; - /*To keep backward compatibility with older userspace, - *x86 needs to hanlde !user_alloc case. - */ - if (!user_alloc) { - if (npages && !old.rmap) { - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); - - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - } - } + r = kvm_mmu_module_init(); + if (r) + goto out4; + + kvm_init_debug(); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + kvm_init_msr_list(); + + if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { + r = -ENOMEM; + goto out; } - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - kvm_flush_remote_tlbs(kvm); + bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; + memset(__va(bad_page_address), 0, PAGE_SIZE); return 0; -} -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; +out: + kvm_exit_debug(); + kvm_mmu_module_exit(); +out4: + return r; } -static void vcpu_kick_intr(void *info) +static __exit void kvm_exit(void) { -#ifdef DEBUG - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; - printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); -#endif + kvm_exit_debug(); + __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); + kvm_mmu_module_exit(); } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int ipi_pcpu = vcpu->cpu; +module_init(kvm_init) +module_exit(kvm_exit) - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - if (vcpu->guest_mode) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); 
-} +EXPORT_SYMBOL_GPL(kvm_init_x86); +EXPORT_SYMBOL_GPL(kvm_exit_x86); diff --git a/trunk/arch/x86/kvm/kvm_svm.h b/trunk/drivers/kvm/kvm_svm.h similarity index 96% rename from trunk/arch/x86/kvm/kvm_svm.h rename to trunk/drivers/kvm/kvm_svm.h index ecdfe97e4635..a0e415daef5b 100644 --- a/trunk/arch/x86/kvm/kvm_svm.h +++ b/trunk/drivers/kvm/kvm_svm.h @@ -4,10 +4,10 @@ #include #include #include -#include #include #include "svm.h" +#include "kvm.h" static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 diff --git a/trunk/arch/x86/kvm/lapic.c b/trunk/drivers/kvm/lapic.c similarity index 83% rename from trunk/arch/x86/kvm/lapic.c rename to trunk/drivers/kvm/lapic.c index 2cbee9479ce4..238fcad3cece 100644 --- a/trunk/arch/x86/kvm/lapic.c +++ b/trunk/drivers/kvm/lapic.c @@ -17,7 +17,7 @@ * the COPYING file in the top-level directory. */ -#include +#include "kvm.h" #include #include #include @@ -56,7 +56,6 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) - static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((u32 *) (apic->regs + reg_off)); @@ -89,7 +88,7 @@ static inline void apic_clear_vector(int vec, void *bitmap) static inline int apic_hw_enabled(struct kvm_lapic *apic) { - return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; + return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; } static inline int apic_sw_enabled(struct kvm_lapic *apic) @@ -173,7 +172,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; int highest_irr; if (!apic) @@ -184,10 +183,8 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) +int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) { - struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic_test_and_set_irr(vec, apic)) { /* a new pending irq is set in IRR */ if (trig) @@ -271,7 +268,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode) { int result = 0; - struct kvm_lapic *target = vcpu->arch.apic; + struct kvm_lapic *target = vcpu->apic; apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x", @@ -338,10 +335,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { + vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -362,11 +359,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) printk(KERN_DEBUG "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); - vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; + vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { printk(KERN_DEBUG @@ -379,9 +376,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_STARTUP: printk(KERN_DEBUG "SIPI to vcpu %d 
vector 0x%02x\n", vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { - vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; + if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { + vcpu->sipi_vector = vector; + vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -395,14 +392,15 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } -static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, +struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, unsigned long bitmap) { + int vcpu_id; int last; int next; - struct kvm_lapic *apic = NULL; + struct kvm_lapic *apic; - last = kvm->arch.round_robin_prev_vcpu; + last = kvm->round_robin_prev_vcpu; next = last; do { @@ -410,30 +408,25 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, next = 0; if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) continue; - apic = kvm->vcpus[next]->arch.apic; + apic = kvm->vcpus[next]->apic; if (apic && apic_enabled(apic)) break; apic = NULL; } while (next != last); - kvm->arch.round_robin_prev_vcpu = next; + kvm->round_robin_prev_vcpu = next; - if (!apic) - printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); + if (!apic) { + vcpu_id = ffs(bitmap) - 1; + if (vcpu_id < 0) { + vcpu_id = 0; + printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); + } + apic = kvm->vcpus[vcpu_id]->apic; + } return apic; } -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) -{ - struct kvm_lapic *apic; - - apic = kvm_apic_round_robin(kvm, vector, bitmap); - if (apic) - return apic->vcpu; - return NULL; -} - static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); @@ -465,7 +458,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) unsigned int delivery_mode = icr_low & APIC_MODE_MASK; unsigned int vector = icr_low & APIC_VECTOR_MASK; - struct kvm_vcpu *target; + struct kvm_lapic *target; struct kvm_vcpu *vcpu; unsigned long lpr_map = 0; int i; @@ -481,20 +474,20 @@ static void apic_send_ipi(struct kvm_lapic *apic) if (!vcpu) continue; - if (vcpu->arch.apic && + if (vcpu->apic && apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { if (delivery_mode == APIC_DM_LOWEST) set_bit(vcpu->vcpu_id, &lpr_map); else - __apic_accept_irq(vcpu->arch.apic, delivery_mode, + __apic_accept_irq(vcpu->apic, delivery_mode, vector, level, trig_mode); } } if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); + target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map); if (target != NULL) - __apic_accept_irq(target->arch.apic, delivery_mode, + __apic_accept_irq(target, delivery_mode, vector, level, trig_mode); } } @@ -551,23 +544,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) return tmcct; } -static void __report_tpr_access(struct kvm_lapic *apic, bool write) -{ - struct kvm_vcpu *vcpu = apic->vcpu; - struct kvm_run *run = vcpu->run; - - set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); - kvm_x86_ops->cache_regs(vcpu); - run->tpr_access.rip = vcpu->arch.rip; - run->tpr_access.is_write = write; -} - -static inline void report_tpr_access(struct kvm_lapic *apic, bool write) -{ - if (apic->vcpu->arch.tpr_access_reporting) - __report_tpr_access(apic, write); -} - static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; @@ -585,9 +561,6 @@ static u32 
__apic_read(struct kvm_lapic *apic, unsigned int offset) val = apic_get_tmcct(apic); break; - case APIC_TASKPRI: - report_tpr_access(apic, false); - /* fall thru */ default: apic_update_ppr(apic); val = apic_get_reg(apic, offset); @@ -697,7 +670,6 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_TASKPRI: - report_tpr_access(apic, true); apic_set_tpr(apic, val & 0xff); break; @@ -790,17 +762,19 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) return ret; } -void kvm_free_lapic(struct kvm_vcpu *vcpu) +void kvm_free_apic(struct kvm_lapic *apic) { - if (!vcpu->arch.apic) + if (!apic) return; - hrtimer_cancel(&vcpu->arch.apic->timer.dev); + hrtimer_cancel(&apic->timer.dev); - if (vcpu->arch.apic->regs_page) - __free_page(vcpu->arch.apic->regs_page); + if (apic->regs_page) { + __free_page(apic->regs_page); + apic->regs_page = 0; + } - kfree(vcpu->arch.apic); + kfree(apic); } /* @@ -811,17 +785,16 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; if (!apic) return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (apic_get_reg(apic, APIC_TASKPRI) & 4)); + apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; u64 tpr; if (!apic) @@ -834,29 +807,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; if (!apic) { value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; + vcpu->apic_base = value; return; } if (apic->vcpu->vcpu_id) value &= ~MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - apic->base_address = apic->vcpu->arch.apic_base & + vcpu->apic_base = value; + apic->base_address = apic->vcpu->apic_base & MSR_IA32_APICBASE_BASE; /* with FSB delivery interrupt, we can restart APIC functionality */ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " - "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); + "0x%lx.\n", apic->apic_base, apic->base_address); } u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic_base; + return vcpu->apic_base; } EXPORT_SYMBOL_GPL(kvm_lapic_get_base); @@ -868,7 +841,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_debug("%s\n", __FUNCTION__); ASSERT(vcpu); - apic = vcpu->arch.apic; + apic = vcpu->apic; ASSERT(apic != NULL); /* Stop the timer in case it's a reset to an active apic */ @@ -899,19 +872,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) update_divide_count(apic); atomic_set(&apic->timer.pending, 0); if (vcpu->vcpu_id == 0) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, vcpu, kvm_apic_id(apic), - vcpu->arch.apic_base, apic->base_address); + vcpu->apic_base, apic->base_address); } EXPORT_SYMBOL_GPL(kvm_lapic_reset); int kvm_lapic_enabled(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; int ret = 0; if (!apic) @@ -935,8 +908,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); 
- if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + if (waitqueue_active(q)) + { + apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; wake_up_interruptible(q); } if (apic_lvtt_period(apic)) { @@ -982,13 +956,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) if (!apic) goto nomem; - vcpu->arch.apic = apic; + vcpu->apic = apic; apic->regs_page = alloc_page(GFP_KERNEL); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); - goto nomem_free_apic; + goto nomem; } apic->regs = page_address(apic->regs_page); memset(apic->regs, 0, PAGE_SIZE); @@ -997,7 +971,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); apic->timer.dev.function = apic_timer_fn; apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; + vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; kvm_lapic_reset(vcpu); apic->dev.read = apic_mmio_read; @@ -1006,16 +980,15 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->dev.private = apic; return 0; -nomem_free_apic: - kfree(apic); nomem: + kvm_free_apic(apic); return -ENOMEM; } EXPORT_SYMBOL_GPL(kvm_create_lapic); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; int highest_irr; if (!apic || !apic_enabled(apic)) @@ -1031,11 +1004,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { - u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); + u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); int r = 0; if (vcpu->vcpu_id == 0) { - if (!apic_hw_enabled(vcpu->arch.apic)) + if (!apic_hw_enabled(vcpu->apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) @@ -1046,7 +1019,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; if (apic && apic_lvt_enabled(apic, APIC_LVTT) && atomic_read(&apic->timer.pending) > 0) { @@ -1057,7 +1030,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) apic->timer.last_update = ktime_add_ns( @@ -1068,7 +1041,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; if (vector == -1) return -1; @@ -1081,9 +1054,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; - apic->base_address = vcpu->arch.apic_base & + apic->base_address = vcpu->apic_base & MSR_IA32_APICBASE_BASE; apic_set_reg(apic, APIC_LVR, APIC_VERSION); apic_update_ppr(apic); @@ -1092,9 +1065,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) start_apic_timer(apic); } -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_lapic *apic = vcpu->apic; struct hrtimer *timer; if (!apic) @@ -1104,51 +1077,4 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) if 
(hrtimer_cancel(timer)) hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); } - -void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) -{ - u32 data; - void *vapic; - - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) - return; - - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic, KM_USER0); - - apic_set_tpr(vcpu->arch.apic, data & 0xff); -} - -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) -{ - u32 data, tpr; - int max_irr, max_isr; - struct kvm_lapic *apic; - void *vapic; - - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) - return; - - apic = vcpu->arch.apic; - tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; - max_irr = apic_find_highest_irr(apic); - if (max_irr < 0) - max_irr = 0; - max_isr = apic_find_highest_isr(apic); - if (max_isr < 0) - max_isr = 0; - data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic, KM_USER0); -} - -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) -{ - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - vcpu->arch.apic->vapic_addr = vapic_addr; -} +EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); diff --git a/trunk/drivers/kvm/mmu.c b/trunk/drivers/kvm/mmu.c new file mode 100644 index 000000000000..feb5ac986c5d --- /dev/null +++ b/trunk/drivers/kvm/mmu.c @@ -0,0 +1,1498 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "vmx.h" +#include "kvm.h" + +#include +#include +#include +#include +#include + +#include +#include + +#undef MMU_DEBUG + +#undef AUDIT + +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) 
do { } while (0) + +#endif + +#if defined(MMU_DEBUG) || defined(AUDIT) +static int dbg = 1; +#endif + +#ifndef MMU_DEBUG +#define ASSERT(x) do { } while (0) +#else +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } +#endif + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_MASK (1ULL << 63) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) + + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_FETCH_MASK (1U << 4) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define RMAP_EXT 4 + +struct kvm_rmap_desc { + u64 *shadow_ptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + +static struct kmem_cache *pte_chain_cache; +static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_header_cache; + +static int is_write_protection(struct kvm_vcpu *vcpu) +{ + return vcpu->cr0 & X86_CR0_WP; +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_nx(struct kvm_vcpu *vcpu) +{ + return vcpu->shadow_efer & EFER_NX; +} + +static int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +static int is_writeble_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_io_pte(unsigned long pte) +{ + return pte & PT_SHADOW_IO_MARK; +} + +static int is_rmap_pte(u64 pte) +{ + return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK)) + == (PT_WRITABLE_MASK | PT_PRESENT_MASK); +} + +static void set_shadow_pte(u64 *sptep, u64 spte) 
+{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + kfree(mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, + int min) +{ + struct page *page; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_page_private(page, 0); + cache->objects[cache->nobjs++] = page_address(page); + } + return 0; +} + +static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + kvm_mmu_free_some_pages(vcpu); + r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, + pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, + rmap_desc_cache, 1); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); + mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); + mmu_free_memory_cache_page(&vcpu->mmu_page_cache); + mmu_free_memory_cache(&vcpu->mmu_page_header_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + memset(p, 0, size); + return p; +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_pte_chain *pc) +{ + kfree(pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) +{ + kfree(rd); +} + +/* + * Reverse mapping data structures: + * + * If page->private bit zero is zero, then page->private points to the + * shadow page table entry that points to page_address(page). + * + * If page->private bit zero is one, (then page->private & ~1) points + * to a struct kvm_rmap_desc containing more mappings. 
+ */ +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) +{ + struct page *page; + struct kvm_rmap_desc *desc; + int i; + + if (!is_rmap_pte(*spte)) + return; + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + if (!page_private(page)) { + rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); + set_page_private(page,(unsigned long)spte); + } else if (!(page_private(page) & 1)) { + rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_rmap_desc(vcpu); + desc->shadow_ptes[0] = (u64 *)page_private(page); + desc->shadow_ptes[1] = spte; + set_page_private(page,(unsigned long)desc | 1); + } else { + rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + desc = desc->more; + if (desc->shadow_ptes[RMAP_EXT-1]) { + desc->more = mmu_alloc_rmap_desc(vcpu); + desc = desc->more; + } + for (i = 0; desc->shadow_ptes[i]; ++i) + ; + desc->shadow_ptes[i] = spte; + } +} + +static void rmap_desc_remove_entry(struct page *page, + struct kvm_rmap_desc *desc, + int i, + struct kvm_rmap_desc *prev_desc) +{ + int j; + + for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + ; + desc->shadow_ptes[i] = desc->shadow_ptes[j]; + desc->shadow_ptes[j] = NULL; + if (j != 0) + return; + if (!prev_desc && !desc->more) + set_page_private(page,(unsigned long)desc->shadow_ptes[0]); + else + if (prev_desc) + prev_desc->more = desc->more; + else + set_page_private(page,(unsigned long)desc->more | 1); + mmu_free_rmap_desc(desc); +} + +static void rmap_remove(u64 *spte) +{ + struct page *page; + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + int i; + + if (!is_rmap_pte(*spte)) + return; + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + if (!page_private(page)) { + printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + BUG(); + } else if (!(page_private(page) & 1)) { + rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + if ((u64 *)page_private(page) != spte) { + printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", + spte, *spte); + BUG(); + } + set_page_private(page,0); + } else { + rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); + prev_desc = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) + if (desc->shadow_ptes[i] == spte) { + rmap_desc_remove_entry(page, + desc, i, + prev_desc); + return; + } + prev_desc = desc; + desc = desc->more; + } + BUG(); + } +} + +static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +{ + struct kvm *kvm = vcpu->kvm; + struct page *page; + struct kvm_rmap_desc *desc; + u64 *spte; + + page = gfn_to_page(kvm, gfn); + BUG_ON(!page); + + while (page_private(page)) { + if (!(page_private(page) & 1)) + spte = (u64 *)page_private(page); + else { + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); + spte = desc->shadow_ptes[0]; + } + BUG_ON(!spte); + BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT + != page_to_pfn(page)); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON(!(*spte & PT_WRITABLE_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + rmap_remove(spte); + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + +#ifdef MMU_DEBUG +static int is_empty_shadow_page(u64 *spt) +{ + u64 *pos; + u64 *end; + + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + if (*pos != 0) { + 
printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + pos, *pos); + return 0; + } + return 1; +} +#endif + +static void kvm_mmu_free_page(struct kvm *kvm, + struct kvm_mmu_page *page_head) +{ + ASSERT(is_empty_shadow_page(page_head->spt)); + list_del(&page_head->link); + __free_page(virt_to_page(page_head->spt)); + kfree(page_head); + ++kvm->n_free_mmu_pages; +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte) +{ + struct kvm_mmu_page *page; + + if (!vcpu->kvm->n_free_mmu_pages) + return NULL; + + page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, + sizeof *page); + page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(page->spt), (unsigned long)page); + list_add(&page->link, &vcpu->kvm->active_mmu_pages); + ASSERT(is_empty_shadow_page(page->spt)); + page->slot_bitmap = 0; + page->multimapped = 0; + page->parent_pte = parent_pte; + --vcpu->kvm->n_free_mmu_pages; + return page; +} + +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!parent_pte) + return; + if (!page->multimapped) { + u64 *old = page->parent_pte; + + if (!old) { + page->parent_pte = parent_pte; + return; + } + page->multimapped = 1; + pte_chain = mmu_alloc_pte_chain(vcpu); + INIT_HLIST_HEAD(&page->parent_ptes); + hlist_add_head(&pte_chain->link, &page->parent_ptes); + pte_chain->parent_ptes[0] = old; + } + hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { + if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) + continue; + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) + if (!pte_chain->parent_ptes[i]) { + pte_chain->parent_ptes[i] = parent_pte; + return; + } + } + pte_chain = mmu_alloc_pte_chain(vcpu); + BUG_ON(!pte_chain); + hlist_add_head(&pte_chain->link, &page->parent_ptes); + pte_chain->parent_ptes[0] = parent_pte; +} + +static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, + u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!page->multimapped) { + BUG_ON(page->parent_pte != parent_pte); + page->parent_pte = NULL; + return; + } + hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + if (pte_chain->parent_ptes[i] != parent_pte) + continue; + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { + pte_chain->parent_ptes[i] + = pte_chain->parent_ptes[i + 1]; + ++i; + } + pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(pte_chain); + if (hlist_empty(&page->parent_ptes)) { + page->multimapped = 0; + page->parent_pte = NULL; + } + } + return; + } + BUG(); +} + +static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, + gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry(page, node, bucket, hash_link) + if (page->gfn == gfn && !page->role.metaphysical) { + pgprintk("%s: found role %x\n", + __FUNCTION__, page->role.word); + return page; + } + return NULL; +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu 
*vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int metaphysical, + unsigned hugepage_access, + u64 *parent_pte) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node; + + role.word = 0; + role.glevels = vcpu->mmu.root_level; + role.level = level; + role.metaphysical = metaphysical; + role.hugepage_access = hugepage_access; + if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, + gfn, role.word); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry(page, node, bucket, hash_link) + if (page->gfn == gfn && page->role.word == role.word) { + mmu_page_add_parent_pte(vcpu, page, parent_pte); + pgprintk("%s: found\n", __FUNCTION__); + return page; + } + page = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!page) + return page; + pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); + page->gfn = gfn; + page->role = role; + hlist_add_head(&page->hash_link, bucket); + if (!metaphysical) + rmap_write_protect(vcpu, gfn); + return page; +} + +static void kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *page) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = page->spt; + + if (page->role.level == PT_PAGE_TABLE_LEVEL) { + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] & PT_PRESENT_MASK) + rmap_remove(&pt[i]); + pt[i] = 0; + } + kvm_flush_remote_tlbs(kvm); + return; + } + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + pt[i] = 0; + if (!(ent & PT_PRESENT_MASK)) + continue; + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(page_header(ent), &pt[i]); + } + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_put_page(struct kvm_mmu_page *page, + u64 *parent_pte) +{ + mmu_page_remove_parent_pte(page, parent_pte); +} + +static void kvm_mmu_zap_page(struct kvm *kvm, + struct kvm_mmu_page *page) +{ + u64 *parent_pte; + + while (page->multimapped || page->parent_pte) { + if (!page->multimapped) + parent_pte = page->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(page->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(page, parent_pte); + set_shadow_pte(parent_pte, 0); + } + kvm_mmu_page_unlink_children(kvm, page); + if (!page->root_count) { + hlist_del(&page->hash_link); + kvm_mmu_free_page(kvm, page); + } else + list_move(&page->link, &kvm->active_mmu_pages); +} + +static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *page; + struct hlist_node *node, *n; + int r; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + r = 0; + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) + if (page->gfn == gfn && !page->role.metaphysical) { + pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + page->role.word); + kvm_mmu_zap_page(vcpu->kvm, page); + r = 1; + } + return r; +} + +static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *page; + + while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { + pgprintk("%s: zap %lx 
%x\n", + __FUNCTION__, gfn, page->role.word); + kvm_mmu_zap_page(vcpu->kvm, page); + } +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); + struct kvm_mmu_page *page_head = page_header(__pa(pte)); + + __set_bit(slot, &page_head->slot_bitmap); +} + +hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; +} + +hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct page *page; + + ASSERT((gpa & HPA_ERR_MASK) == 0); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) + return gpa | HPA_ERR_MASK; + return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) + | (gpa & (PAGE_SIZE-1)); +} + +hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return UNMAPPED_GVA; + return gpa_to_hpa(vcpu, gpa); +} + +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) +{ + int level = PT32E_ROOT_LEVEL; + hpa_t table_addr = vcpu->mmu.root_hpa; + + for (; ; level--) { + u32 index = PT64_INDEX(v, level); + u64 *table; + u64 pte; + + ASSERT(VALID_PAGE(table_addr)); + table = __va(table_addr); + + if (level == 1) { + pte = table[index]; + if (is_present_pte(pte) && is_writeble_pte(pte)) + return 0; + mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); + page_header_update_slot(vcpu->kvm, table, v); + table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | + PT_USER_MASK; + rmap_add(vcpu, &table[index]); + return 0; + } + + if (table[index] == 0) { + struct kvm_mmu_page *new_table; + gfn_t pseudo_gfn; + + pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, + v, level - 1, + 1, 0, &table[index]); + if (!new_table) { + pgprintk("nonpaging_map: ENOMEM\n"); + return -ENOMEM; + } + + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + } + table_addr = table[index] & PT64_BASE_ADDR_MASK; + } +} + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *page; + + if (!VALID_PAGE(vcpu->mmu.root_hpa)) + return; +#ifdef CONFIG_X86_64 + if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->mmu.root_hpa; + + page = page_header(root); + --page->root_count; + vcpu->mmu.root_hpa = INVALID_PAGE; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + page = page_header(root); + --page->root_count; + } + vcpu->mmu.pae_root[i] = INVALID_PAGE; + } + vcpu->mmu.root_hpa = INVALID_PAGE; +} + +static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *page; + + root_gfn = vcpu->cr3 >> PAGE_SHIFT; + +#ifdef CONFIG_X86_64 + if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + page = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, 0, 0, NULL); + root = __pa(page->spt); + ++page->root_count; + vcpu->mmu.root_hpa = root; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[i]; + + 
ASSERT(!VALID_PAGE(root)); + if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) { + if (!is_present_pte(vcpu->pdptrs[i])) { + vcpu->mmu.pae_root[i] = 0; + continue; + } + root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; + } else if (vcpu->mmu.root_level == 0) + root_gfn = 0; + page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, !is_paging(vcpu), + 0, NULL); + root = __pa(page->spt); + ++page->root_count; + vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + return vaddr; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + gpa_t addr = gva; + hpa_t paddr; + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); + + + paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); + + if (is_error_hpa(paddr)) + return 1; + + return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + u64 addr, + u32 err_code) +{ + kvm_x86_ops->inject_page_fault(vcpu, addr, err_code); +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->mmu; + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + if (!is_paging(vcpu)) + return nonpaging_init_context(vcpu); + else if (is_long_mode(vcpu)) + return paging64_init_context(vcpu); + else if (is_pae(vcpu)) + return paging32E_init_context(vcpu); + else + return paging32_init_context(vcpu); 
+} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->mmu.root_hpa)) { + vcpu->mmu.free(vcpu); + vcpu->mmu.root_hpa = INVALID_PAGE; + } +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + mutex_lock(&vcpu->kvm->lock); + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + mmu_alloc_roots(vcpu); + kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); +out: + mutex_unlock(&vcpu->kvm->lock); + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_present_pte(pte)) { + if (page->role.level == PT_PAGE_TABLE_LEVEL) + rmap_remove(spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, spte); + } + } + set_shadow_pte(spte, 0); + kvm_flush_remote_tlbs(vcpu->kvm); +} + +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte, + const void *new, int bytes) +{ + if (page->role.level != PT_PAGE_TABLE_LEVEL) + return; + + if (page->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, page, spte, new, bytes); + else + paging64_update_pte(vcpu, page, spte, new, bytes); +} + +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_mmu_page *page; + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; + u64 *spte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + unsigned quadrant; + int level; + int flooded = 0; + int npte; + + pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + if (gfn == vcpu->last_pt_write_gfn) { + ++vcpu->last_pt_write_count; + if (vcpu->last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->last_pt_write_gfn = gfn; + vcpu->last_pt_write_count = 1; + } + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->mmu_page_hash[index]; + hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { + if (page->gfn != gfn || page->role.metaphysical) + continue; + pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, page->role.word); + kvm_mmu_zap_page(vcpu->kvm, page); + continue; + } + page_offset = offset; + level = page->role.level; + npte = 1; + if (page->role.glevels == PT32_ROOT_LEVEL) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. 
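+ * For example, a 4-byte guest pde at offset 0x10 becomes 0x20 after + * the 32->64 doubling below, then 0x40 after the second doubling, so + * the two 8-byte shadow pdes at offsets 0x40 and 0x48 get zapped.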
+ */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + npte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != page->role.quadrant) + continue; + } + spte = &page->spt[page_offset / sizeof(*spte)]; + while (npte--) { + mmu_pte_write_zap_pte(vcpu, page, spte); + mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); + ++spte; + } + } +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); + + return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); +} + +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { + struct kvm_mmu_page *page; + + page = container_of(vcpu->kvm->active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, page); + } +} + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + while (!list_empty(&vcpu->kvm->active_mmu_pages)) { + page = container_of(vcpu->kvm->active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, page); + } + free_page((unsigned long)vcpu->mmu.pae_root); +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct page *page; + int i; + + ASSERT(vcpu); + + vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; + + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. + * Therefore we need to allocate shadow page tables in the first + * 4GB of memory, which happens to fit the DMA32 zone. + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32); + if (!page) + goto error_1; + vcpu->mmu.pae_root = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->mmu.pae_root[i] = INVALID_PAGE; + + return 0; + +error_1: + free_mmu_pages(vcpu); + return -ENOMEM; +} + +int kvm_mmu_create(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + return alloc_mmu_pages(vcpu); +} + +int kvm_mmu_setup(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); + + return init_kvm_mmu(vcpu); +} + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &kvm->active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, &page->slot_bitmap)) + continue; + + pt = page->spt; + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (pt[i] & PT_WRITABLE_MASK) { + rmap_remove(&pt[i]); + pt[i] &= ~PT_WRITABLE_MASK; + } + } +} + +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *page, *node; + + list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link) + kvm_mmu_zap_page(kvm, page); + + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_module_exit(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int kvm_mmu_module_init(void) +{ + pte_chain_cache = kmem_cache_create("kvm_pte_chain", + sizeof(struct kvm_pte_chain), + 0, 0, NULL); + if (!pte_chain_cache) + goto nomem; + rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", + sizeof(struct kvm_rmap_desc), + 0, 0, NULL); + if (!rmap_desc_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), 
+ 0, 0, NULL); + if (!mmu_page_header_cache) + goto nomem; + + return 0; + +nomem: + kvm_mmu_module_exit(); + return -ENOMEM; +} + +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) +{ +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + + va = canonicalize(va); + if (level > 1) + audit_mappings_page(vcpu, ent, va, level - 1); + else { + gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + if ((ent & PT_PRESENT_MASK) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx\n", + audit_msg, vcpu->mmu.root_level, + va, gpa, hpa, ent); + } + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->mmu.pae_root[i], + i << 30, + 2); +} + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k; + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + struct page *page = m->phys_mem[j]; + + if (!page->private) + continue; + if (!(page->private & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(page->private & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->shadow_ptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + return nmaps; +} + +static int count_writable_mappings(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + struct kvm_mmu_page *page; + int i; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + u64 *pt = page->spt; + + if (page->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + ++nmaps; + } + } + return nmaps; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + int n_rmap = count_rmaps(vcpu); + int n_actual = count_writable_mappings(vcpu); + + if (n_rmap != n_actual) + printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", + __FUNCTION__, audit_msg, n_rmap, n_actual); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *page; + + list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { + hfn_t hfn; + struct page *pg; + + if (page->role.metaphysical) + continue; + + hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) + >> PAGE_SHIFT; + pg = pfn_to_page(hfn); + if (pg->private) + printk(KERN_ERR "%s: (%s) shadow page has writable" + " mappings: gfn %lx role %x\n", + __FUNCTION__, audit_msg, page->gfn, + page->role.word); + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + audit_mappings(vcpu); + dbg = olddbg; +} + +#endif diff --git a/trunk/drivers/kvm/paging_tmpl.h b/trunk/drivers/kvm/paging_tmpl.h new file mode 100644 index 
000000000000..6b094b44f8fb --- /dev/null +++ b/trunk/drivers/kvm/paging_tmpl.h @@ -0,0 +1,511 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +/* + * We need the mmu code to access both 32-bit and 64-bit guest ptes, + * so the code in this file is compiled twice, once per pte size. + */ + +#if PTTYPE == 64 + #define pt_element_t u64 + #define guest_walker guest_walker64 + #define FNAME(name) paging##64_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #else + #define PT_MAX_FULL_LEVELS 2 + #endif +#elif PTTYPE == 32 + #define pt_element_t u32 + #define guest_walker guest_walker32 + #define FNAME(name) paging##32_##name + #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) + #define PT_MAX_FULL_LEVELS 2 +#else + #error Invalid PTTYPE value +#endif + +/* + * The guest_walker structure emulates the behavior of the hardware page + * table walker. + */ +struct guest_walker { + int level; + gfn_t table_gfn[PT_MAX_FULL_LEVELS]; + pt_element_t *table; + pt_element_t pte; + pt_element_t *ptep; + struct page *page; + int index; + pt_element_t inherited_ar; + gfn_t gfn; + u32 error_code; +}; + +/* + * Fetch a guest pte for a guest virtual address + */ +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, int fetch_fault) +{ + hpa_t hpa; + struct kvm_memory_slot *slot; + pt_element_t *ptep; + pt_element_t root; + gfn_t table_gfn; + + pgprintk("%s: addr %lx\n", __FUNCTION__, addr); + walker->level = vcpu->mmu.root_level; + walker->table = NULL; + walker->page = NULL; + walker->ptep = NULL; + root = vcpu->cr3; +#if PTTYPE == 64 + if (!is_long_mode(vcpu)) { + walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; + root = *walker->ptep; + walker->pte = root; + if (!(root & PT_PRESENT_MASK)) + goto not_present; + --walker->level; + } +#endif + table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + walker->table_gfn[walker->level - 1] = table_gfn; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + walker->level - 1, table_gfn); + slot = gfn_to_memslot(vcpu->kvm, table_gfn); + hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); + walker->page = pfn_to_page(hpa >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); + + ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || + (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + + walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; + + for (;;) { + int index = PT_INDEX(addr, walker->level); + hpa_t paddr; + + ptep = &walker->table[index]; + walker->index = index; + ASSERT(((unsigned long)walker->table & PAGE_MASK) == + ((unsigned long)ptep & PAGE_MASK)); + + if (!is_present_pte(*ptep)) + 
goto not_present; + + if (write_fault && !is_writeble_pte(*ptep)) + if (user_fault || is_write_protection(vcpu)) + goto access_error; + + if (user_fault && !(*ptep & PT_USER_MASK)) + goto access_error; + +#if PTTYPE == 64 + if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK)) + goto access_error; +#endif + + if (!(*ptep & PT_ACCESSED_MASK)) { + mark_page_dirty(vcpu->kvm, table_gfn); + *ptep |= PT_ACCESSED_MASK; + } + + if (walker->level == PT_PAGE_TABLE_LEVEL) { + walker->gfn = (*ptep & PT_BASE_ADDR_MASK) + >> PAGE_SHIFT; + break; + } + + if (walker->level == PT_DIRECTORY_LEVEL + && (*ptep & PT_PAGE_SIZE_MASK) + && (PTTYPE == 64 || is_pse(vcpu))) { + walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); + break; + } + + walker->inherited_ar &= walker->table[index]; + table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; + kunmap_atomic(walker->table, KM_USER0); + paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); + walker->page = pfn_to_page(paddr >> PAGE_SHIFT); + walker->table = kmap_atomic(walker->page, KM_USER0); + --walker->level; + walker->table_gfn[walker->level - 1 ] = table_gfn; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + walker->level - 1, table_gfn); + } + walker->pte = *ptep; + if (walker->page) + walker->ptep = NULL; + if (walker->table) + kunmap_atomic(walker->table, KM_USER0); + pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); + return 1; + +not_present: + walker->error_code = 0; + goto err; + +access_error: + walker->error_code = PFERR_PRESENT_MASK; + +err: + if (write_fault) + walker->error_code |= PFERR_WRITE_MASK; + if (user_fault) + walker->error_code |= PFERR_USER_MASK; + if (fetch_fault) + walker->error_code |= PFERR_FETCH_MASK; + if (walker->table) + kunmap_atomic(walker->table, KM_USER0); + return 0; +} + +static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, + struct guest_walker *walker) +{ + mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); +} + +static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, + u64 *shadow_pte, + gpa_t gaddr, + pt_element_t gpte, + u64 access_bits, + int user_fault, + int write_fault, + int *ptwrite, + struct guest_walker *walker, + gfn_t gfn) +{ + hpa_t paddr; + int dirty = gpte & PT_DIRTY_MASK; + u64 spte = *shadow_pte; + int was_rmapped = is_rmap_pte(spte); + + pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, spte, (u64)gpte, access_bits, + write_fault, user_fault, gfn); + + if (write_fault && !dirty) { + pt_element_t *guest_ent, *tmp = NULL; + + if (walker->ptep) + guest_ent = walker->ptep; + else { + tmp = kmap_atomic(walker->page, KM_USER0); + guest_ent = &tmp[walker->index]; + } + + *guest_ent |= PT_DIRTY_MASK; + if (!walker->ptep) + kunmap_atomic(tmp, KM_USER0); + dirty = 1; + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); + } + + spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; + spte |= gpte & PT64_NX_MASK; + if (!dirty) + access_bits &= ~PT_WRITABLE_MASK; + + paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); + + spte |= PT_PRESENT_MASK; + if (access_bits & PT_USER_MASK) + spte |= PT_USER_MASK; + + if (is_error_hpa(paddr)) { + spte |= gaddr; + spte |= PT_SHADOW_IO_MARK; + spte &= ~PT_PRESENT_MASK; + set_shadow_pte(shadow_pte, spte); + return; + } + + spte |= paddr; + + if ((access_bits & PT_WRITABLE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + struct kvm_mmu_page *shadow; + + spte |= PT_WRITABLE_MASK; + if (user_fault) { + 
mmu_unshadow(vcpu, gfn); + goto unshadowed; + } + + shadow = kvm_mmu_lookup_page(vcpu, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + access_bits &= ~PT_WRITABLE_MASK; + if (is_writeble_pte(spte)) { + spte &= ~PT_WRITABLE_MASK; + kvm_x86_ops->tlb_flush(vcpu); + } + if (write_fault) + *ptwrite = 1; + } + } + +unshadowed: + + if (access_bits & PT_WRITABLE_MASK) + mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); + + set_shadow_pte(shadow_pte, spte); + page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); + if (!was_rmapped) + rmap_add(vcpu, shadow_pte); +} + +static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) +{ + access_bits &= gpte; + FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, + gpte, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes) +{ + pt_element_t gpte; + + if (bytes < sizeof(pt_element_t)) + return; + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, + 0, NULL, NULL, + (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); +} + +static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, + u64 *shadow_pte, u64 access_bits, + int user_fault, int write_fault, int *ptwrite, + struct guest_walker *walker, gfn_t gfn) +{ + gpa_t gaddr; + + access_bits &= gpde; + gaddr = (gpa_t)gfn << PAGE_SHIFT; + if (PTTYPE == 32 && is_cpuid_PSE36()) + gaddr |= (gpde & PT32_DIR_PSE36_MASK) << + (32 - PT32_DIR_PSE36_SHIFT); + FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, + gpde, access_bits, user_fault, write_fault, + ptwrite, walker, gfn); +} + +/* + * Fetch a shadow pte for a specific level in the paging hierarchy. 
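+ * It walks the shadow hierarchy from mmu.root_hpa (or the pae_root + * entry), allocating missing intermediate shadow pages with + * kvm_mmu_get_page(), then fills the leaf entry via FNAME(set_pte) or + * FNAME(set_pde) and returns a pointer to that leaf shadow pte.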
+ */ +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *walker, + int user_fault, int write_fault, int *ptwrite) +{ + hpa_t shadow_addr; + int level; + u64 *shadow_ent; + u64 *prev_shadow_ent = NULL; + + if (!is_present_pte(walker->pte)) + return NULL; + + shadow_addr = vcpu->mmu.root_hpa; + level = vcpu->mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + for (; ; level--) { + u32 index = SHADOW_PT_INDEX(addr, level); + struct kvm_mmu_page *shadow_page; + u64 shadow_pte; + int metaphysical; + gfn_t table_gfn; + unsigned hugepage_access = 0; + + shadow_ent = ((u64 *)__va(shadow_addr)) + index; + if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { + if (level == PT_PAGE_TABLE_LEVEL) + break; + shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; + prev_shadow_ent = shadow_ent; + continue; + } + + if (level == PT_PAGE_TABLE_LEVEL) + break; + + if (level - 1 == PT_PAGE_TABLE_LEVEL + && walker->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + hugepage_access = walker->pte; + hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; + if (walker->pte & PT64_NX_MASK) + hugepage_access |= (1 << 2); + hugepage_access >>= PT_WRITABLE_SHIFT; + table_gfn = (walker->pte & PT_BASE_ADDR_MASK) + >> PAGE_SHIFT; + } else { + metaphysical = 0; + table_gfn = walker->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, hugepage_access, + shadow_ent); + shadow_addr = __pa(shadow_page->spt); + shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *shadow_ent = shadow_pte; + prev_shadow_ent = shadow_ent; + } + + if (walker->level == PT_DIRECTORY_LEVEL) { + FNAME(set_pde)(vcpu, walker->pte, shadow_ent, + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); + } else { + ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); + FNAME(set_pte)(vcpu, walker->pte, shadow_ent, + walker->inherited_ar, user_fault, write_fault, + ptwrite, walker, walker->gfn); + } + return shadow_ent; +} + +/* + * Page fault handler. There are several causes for a page fault: + * - there is no shadow pte for the guest pte + * - write access through a shadow pte marked read only so that we can set + * the dirty bit + * - write access to a shadow pte marked read only so we can update the page + * dirty bitmap, when userspace requests it + * - mmio access; in this case we will never install a present shadow pte + * - normal guest page fault due to the guest pte marked not present, not + * writable, or not executable + * + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or + * a negative value on error. + */ +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, + u32 error_code) +{ + int write_fault = error_code & PFERR_WRITE_MASK; + int user_fault = error_code & PFERR_USER_MASK; + int fetch_fault = error_code & PFERR_FETCH_MASK; + struct guest_walker walker; + u64 *shadow_pte; + int write_pt = 0; + int r; + + pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); + kvm_mmu_audit(vcpu, "pre page fault"); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + /* + * Look up the shadow pte for the faulting address. + */ + r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, + fetch_fault); + + /* + * The page is not mapped by the guest. Let the guest handle it. 
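+ * (FNAME(walk_addr) has already filled in walker.error_code here, and + * that is what inject_page_fault() hands back to the guest.)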
+ */ + if (!r) { + pgprintk("%s: guest page fault\n", __FUNCTION__); + inject_page_fault(vcpu, addr, walker.error_code); + vcpu->last_pt_write_count = 0; /* reset fork detector */ + return 0; + } + + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + &write_pt); + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + shadow_pte, *shadow_pte, write_pt); + + if (!write_pt) + vcpu->last_pt_write_count = 0; /* reset fork detector */ + + /* + * mmio: emulate if accessible, otherwise its a guest fault. + */ + if (is_io_pte(*shadow_pte)) + return 1; + + ++vcpu->stat.pf_fixed; + kvm_mmu_audit(vcpu, "post page fault (fixed)"); + + return write_pt; +} + +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + + if (r) { + gpa = (gpa_t)walker.gfn << PAGE_SHIFT; + gpa |= vaddr & ~PAGE_MASK; + } + + return gpa; +} + +#undef pt_element_t +#undef guest_walker +#undef FNAME +#undef PT_BASE_ADDR_MASK +#undef PT_INDEX +#undef SHADOW_PT_INDEX +#undef PT_LEVEL_MASK +#undef PT_DIR_BASE_ADDR_MASK +#undef PT_MAX_FULL_LEVELS diff --git a/trunk/arch/x86/kvm/segment_descriptor.h b/trunk/drivers/kvm/segment_descriptor.h similarity index 53% rename from trunk/arch/x86/kvm/segment_descriptor.h rename to trunk/drivers/kvm/segment_descriptor.h index 56fc4c873389..71fdf458619a 100644 --- a/trunk/arch/x86/kvm/segment_descriptor.h +++ b/trunk/drivers/kvm/segment_descriptor.h @@ -1,6 +1,3 @@ -#ifndef __SEGMENT_DESCRIPTOR_H -#define __SEGMENT_DESCRIPTOR_H - struct segment_descriptor { u16 limit_low; u16 base_low; @@ -17,13 +14,4 @@ struct segment_descriptor { u8 base_high; } __attribute__((packed)); -#ifdef CONFIG_X86_64 -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct segment_descriptor_64 { - struct segment_descriptor s; - u32 base_higher; - u32 pad_zero; -}; -#endif -#endif diff --git a/trunk/arch/x86/kvm/svm.c b/trunk/drivers/kvm/svm.c similarity index 84% rename from trunk/arch/x86/kvm/svm.c rename to trunk/drivers/kvm/svm.c index de755cb1431d..ced4ac1955db 100644 --- a/trunk/arch/x86/kvm/svm.c +++ b/trunk/drivers/kvm/svm.c @@ -13,11 +13,10 @@ * the COPYING file in the top-level directory. 
* */ -#include #include "kvm_svm.h" +#include "x86_emulate.h" #include "irq.h" -#include "mmu.h" #include #include @@ -43,6 +42,9 @@ MODULE_LICENSE("GPL"); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 +#define KVM_EFER_LMA (1 << 10) +#define KVM_EFER_LME (1 << 8) + #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) @@ -100,20 +102,20 @@ static inline u32 svm_has(u32 feat) static inline u8 pop_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); return irq; } static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) { - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); } static inline void clgi(void) @@ -182,30 +184,35 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (!(efer & EFER_LMA)) - efer &= ~EFER_LME; + if (!(efer & KVM_EFER_LMA)) + efer &= ~KVM_EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; - vcpu->arch.shadow_efer = efer; + vcpu->shadow_efer = efer; } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) +static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? 
SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + GP_VECTOR; svm->vmcb->control.event_inj_err = error_code; } -static bool svm_exception_injected(struct kvm_vcpu *vcpu) +static void inject_ud(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); + to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_TYPE_EXEPT | + UD_VECTOR; +} - return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); +static int is_page_fault(uint32_t info) +{ + info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); } static int is_external_interrupt(u32 info) @@ -222,16 +229,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); return; } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) + if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", __FUNCTION__, svm->vmcb->save.rip, svm->next_rip); + } - vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; + vcpu->rip = svm->vmcb->save.rip = svm->next_rip; svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - vcpu->arch.interrupt_window_open = 1; + vcpu->interrupt_window_open = 1; } static int has_svm(void) @@ -304,7 +312,7 @@ static void svm_hardware_enable(void *garbage) svm_data->next_asid = svm_data->max_asid + 1; svm_features = cpuid_edx(SVM_CPUID_FUNC); - asm volatile ("sgdt %0" : "=m"(gdt_descr)); + asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); gdt = (struct desc_struct *)gdt_descr.address; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); @@ -450,13 +458,11 @@ static void init_vmcb(struct vmcb *vmcb) control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK | - INTERCEPT_CR8_MASK; + INTERCEPT_CR4_MASK; control->intercept_cr_write = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK | - INTERCEPT_CR8_MASK; + INTERCEPT_CR4_MASK; control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | @@ -470,8 +476,7 @@ static void init_vmcb(struct vmcb *vmcb) INTERCEPT_DR5_MASK | INTERCEPT_DR7_MASK; - control->intercept_exceptions = (1 << PF_VECTOR) | - (1 << UD_VECTOR); + control->intercept_exceptions = 1 << PF_VECTOR; control->intercept = (1ULL << INTERCEPT_INTR) | @@ -538,7 +543,8 @@ static void init_vmcb(struct vmcb *vmcb) init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); save->efer = MSR_EFER_SVME_MASK; - save->dr6 = 0xffff0ff0; + + save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; @@ -552,7 +558,7 @@ static void init_vmcb(struct vmcb *vmcb) /* rdx = ?? 
*/ } -static int svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -560,11 +566,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) if (vcpu->vcpu_id != 0) { svm->vmcb->save.rip = 0; - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; + svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; + svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; } - - return 0; } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -583,6 +587,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + if (irqchip_in_kernel(kvm)) { + err = kvm_create_lapic(&svm->vcpu); + if (err < 0) + goto free_svm; + } + page = alloc_page(GFP_KERNEL); if (!page) { err = -ENOMEM; @@ -598,9 +608,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (svm->vcpu.vcpu_id == 0) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; @@ -634,7 +644,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * increasing TSC. */ rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; + delta = vcpu->host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; kvm_migrate_apic_timer(vcpu); @@ -649,11 +659,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); int i; - ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - rdtscll(vcpu->arch.host_tsc); + rdtscll(vcpu->host_tsc); + kvm_put_guest_fpu(vcpu); } static void svm_vcpu_decache(struct kvm_vcpu *vcpu) @@ -664,17 +674,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.rip = svm->vmcb->save.rip; + vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->rip = svm->vmcb->save.rip; } static void svm_decache_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.rip; + svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->rip; } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -772,24 +782,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_svm *svm = to_svm(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->shadow_efer & KVM_EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer |= EFER_LMA; - svm->vmcb->save.efer |= EFER_LMA | EFER_LME; + vcpu->shadow_efer |= KVM_EFER_LMA; + svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; } - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.shadow_efer &= ~EFER_LMA; - svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { + vcpu->shadow_efer &= ~KVM_EFER_LMA; + svm->vmcb->save.efer &= ~(KVM_EFER_LMA | 
KVM_EFER_LME); } } #endif - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; } - vcpu->arch.cr0 = cr0; + vcpu->cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; @@ -797,7 +807,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - vcpu->arch.cr4 = cr4; + vcpu->cr4 = cr4; to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; } @@ -902,7 +912,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, svm->db_regs[dr] = value; return; case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) { + if (vcpu->cr4 & X86_CR4_DE) { *exception = UD_VECTOR; return; } @@ -928,30 +938,51 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; + enum emulation_result er; + int r; if (!irqchip_in_kernel(kvm) && is_external_interrupt(exit_int_info)) push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + mutex_lock(&kvm->lock); + fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; - return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); -} + r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); + if (r < 0) { + mutex_unlock(&kvm->lock); + return r; + } + if (!r) { + mutex_unlock(&kvm->lock); + return 1; + } + er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, + error_code); + mutex_unlock(&kvm->lock); -static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - int er; + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++svm->vcpu.stat.mmio_exits; + return 0; + case EMULATE_FAIL: + kvm_report_emulation_failure(&svm->vcpu, "pagetable"); + break; + default: + BUG(); + } - er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); - if (er != EMULATE_DONE) - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; } static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) + if (!(svm->vcpu.cr0 & X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; svm->vcpu.fpu_active = 1; @@ -973,7 +1004,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ + u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? 
int size, down, in, string, rep; unsigned port; @@ -984,8 +1015,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) string = (io_info & SVM_IOIO_STR_MASK) != 0; if (string) { - if (emulate_instruction(&svm->vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1015,14 +1045,13 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { svm->next_rip = svm->vmcb->save.rip + 3; skip_emulated_instruction(&svm->vcpu); - kvm_emulate_hypercall(&svm->vcpu); - return 1; + return kvm_hypercall(&svm->vcpu, kvm_run); } static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); + inject_ud(&svm->vcpu); return 1; } @@ -1044,20 +1073,11 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); return 1; } -static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); - if (irqchip_in_kernel(svm->vcpu.kvm)) - return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; -} - static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1104,14 +1124,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; u64 data; if (svm_get_msr(&svm->vcpu, ecx, &data)) - kvm_inject_gp(&svm->vcpu, 0); + svm_inject_gp(&svm->vcpu, 0); else { svm->vmcb->save.rax = data & 0xffffffff; - svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; + svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; svm->next_rip = svm->vmcb->save.rip + 2; skip_emulated_instruction(&svm->vcpu); } @@ -1156,20 +1176,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_IA32_SYSENTER_ESP: svm->vmcb->save.sysenter_esp = data; break; - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - /* - * only support writing 0 to the performance counters for now - * to make Windows happy. Should be replaced by a real - * performance counter emulation later. 
- */ - if (data != 0) - goto unhandled; - break; default: - unhandled: return kvm_set_msr_common(vcpu, ecx, data); } return 0; @@ -1177,12 +1184,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; u64 data = (svm->vmcb->save.rax & -1u) - | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); svm->next_rip = svm->vmcb->save.rip + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) - kvm_inject_gp(&svm->vcpu, 0); + svm_inject_gp(&svm->vcpu, 0); else skip_emulated_instruction(&svm->vcpu); return 1; @@ -1206,7 +1213,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, * possible */ if (kvm_run->request_interrupt_window && - !svm->vcpu.arch.irq_summary) { + !svm->vcpu.irq_summary) { ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; @@ -1220,12 +1227,10 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_READ_CR0] = emulate_on_interception, [SVM_EXIT_READ_CR3] = emulate_on_interception, [SVM_EXIT_READ_CR4] = emulate_on_interception, - [SVM_EXIT_READ_CR8] = emulate_on_interception, /* for now: */ [SVM_EXIT_WRITE_CR0] = emulate_on_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, - [SVM_EXIT_WRITE_CR8] = cr8_write_interception, [SVM_EXIT_READ_DR0] = emulate_on_interception, [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, @@ -1236,7 +1241,6 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WRITE_DR3] = emulate_on_interception, [SVM_EXIT_WRITE_DR5] = emulate_on_interception, [SVM_EXIT_WRITE_DR7] = emulate_on_interception, - [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_INTR] = nop_on_interception, @@ -1289,7 +1293,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) exit_code); if (exit_code >= ARRAY_SIZE(svm_exit_handlers) - || !svm_exit_handlers[exit_code]) { + || svm_exit_handlers[exit_code] == 0) { kvm_run->exit_reason = KVM_EXIT_UNKNOWN; kvm_run->hw.hardware_exit_reason = exit_code; return 0; @@ -1303,7 +1307,7 @@ static void reload_tss(struct kvm_vcpu *vcpu) int cpu = raw_smp_processor_id(); struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ + svm_data->tss_desc->type = 9; //available 32/64-bit TSS load_TR_desc(); } @@ -1344,6 +1348,7 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; int intr_vector = -1; + kvm_inject_pending_timer_irqs(vcpu); if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { intr_vector = vmcb->control.exit_int_info & @@ -1383,20 +1388,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm) push_irq(&svm->vcpu, control->int_vector); } - svm->vcpu.arch.interrupt_window_open = + svm->vcpu.interrupt_window_open = !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); } static void svm_do_inject_vector(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int word_index = __ffs(vcpu->irq_summary); + int bit_index = 
__ffs(vcpu->irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); svm_inject_irq(svm, irq); } @@ -1406,11 +1411,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - svm->vcpu.arch.interrupt_window_open = + svm->vcpu.interrupt_window_open = (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && (svm->vmcb->save.rflags & X86_EFLAGS_IF)); - if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) + if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. */ @@ -1419,18 +1424,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, /* * Interrupts blocked. Wait for unblock. */ - if (!svm->vcpu.arch.interrupt_window_open && - (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) + if (!svm->vcpu.interrupt_window_open && + (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { control->intercept |= 1ULL << INTERCEPT_VINTR; - else + } else control->intercept &= ~(1ULL << INTERCEPT_VINTR); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - static void save_db_regs(unsigned long *db_regs) { asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); @@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_cr2 = kvm_read_cr2(); svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); - svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm->vmcb->save.cr2 = vcpu->cr2; if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); @@ -1486,9 +1486,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm volatile ( #ifdef CONFIG_X86_64 - "push %%rbp; \n\t" + "push %%rbx; push %%rcx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" #else - "push %%ebp; \n\t" + "push %%ebx; push %%ecx; push %%edx;" + "push %%esi; push %%edi; push %%ebp;" #endif #ifdef CONFIG_X86_64 @@ -1550,7 +1554,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" - "pop %%rbp; \n\t" + "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" #else "mov %%ebx, %c[rbx](%[svm]) \n\t" "mov %%ecx, %c[rcx](%[svm]) \n\t" @@ -1559,40 +1566,34 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%edi, %c[rdi](%[svm]) \n\t" "mov %%ebp, %c[rbp](%[svm]) \n\t" - "pop %%ebp; \n\t" + "pop %%ebp; pop %%edi; pop %%esi;" + "pop %%edx; pop %%ecx; pop %%ebx; \n\t" #endif : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm, 
vcpu.arch.regs[VCPU_REGS_RBP])) + [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) #ifdef CONFIG_X86_64 - , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) + ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), + [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) #endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" - , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx" , "esi", "edi" -#endif - ); + : "cc", "memory" ); if ((svm->vmcb->save.dr7 & 0xff)) load_db_regs(svm->host_db_regs); - vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->cr2 = svm->vmcb->save.cr2; write_dr6(svm->host_dr6); write_dr7(svm->host_dr7); @@ -1626,6 +1627,34 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) } } +static void svm_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + uint32_t err_code) +{ + struct vcpu_svm *svm = to_svm(vcpu); + uint32_t exit_int_info = svm->vmcb->control.exit_int_info; + + ++vcpu->stat.pf_guest; + + if (is_page_fault(exit_int_info)) { + + svm->vmcb->control.event_inj_err = 0; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + DF_VECTOR; + return; + } + vcpu->cr2 = addr; + svm->vmcb->save.cr2 = addr; + svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | + SVM_EVTINJ_VALID_ERR | + SVM_EVTINJ_TYPE_EXEPT | + PF_VECTOR; + svm->vmcb->control.event_inj_err = err_code; +} + + static int is_disabled(void) { u64 vm_cr; @@ -1646,6 +1675,7 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xd9; + hypercall[3] = 0xc3; } static void svm_check_processor_compat(void *rtn) @@ -1653,11 +1683,6 @@ static void svm_check_processor_compat(void *rtn) *(int *)rtn = 0; } -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1666,7 +1691,6 @@ static struct kvm_x86_ops svm_x86_ops = { .check_processor_compatibility = svm_check_processor_compat, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, @@ 
-1701,6 +1725,9 @@ static struct kvm_x86_ops svm_x86_ops = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, + .inject_page_fault = svm_inject_page_fault, + + .inject_gp = svm_inject_gp, .run = svm_vcpu_run, .handle_exit = handle_exit, @@ -1708,23 +1735,19 @@ static struct kvm_x86_ops svm_x86_ops = { .patch_hypercall = svm_patch_hypercall, .get_irq = svm_get_irq, .set_irq = svm_set_irq, - .queue_exception = svm_queue_exception, - .exception_injected = svm_exception_injected, .inject_pending_irq = svm_intr_assist, .inject_pending_vectors = do_interrupt_requests, - - .set_tss_addr = svm_set_tss_addr, }; static int __init svm_init(void) { - return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), + return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) { - kvm_exit(); + kvm_exit_x86(); } module_init(svm_init) diff --git a/trunk/arch/x86/kvm/svm.h b/trunk/drivers/kvm/svm.h similarity index 98% rename from trunk/arch/x86/kvm/svm.h rename to trunk/drivers/kvm/svm.h index 5fd50491b555..3b1b0f35b6cb 100644 --- a/trunk/arch/x86/kvm/svm.h +++ b/trunk/drivers/kvm/svm.h @@ -204,7 +204,6 @@ struct __attribute__ ((__packed__)) vmcb { #define INTERCEPT_CR0_MASK 1 #define INTERCEPT_CR3_MASK (1 << 3) #define INTERCEPT_CR4_MASK (1 << 4) -#define INTERCEPT_CR8_MASK (1 << 8) #define INTERCEPT_DR0_MASK 1 #define INTERCEPT_DR1_MASK (1 << 1) @@ -312,7 +311,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXIT_ERR -1 -#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ +#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" diff --git a/trunk/arch/x86/kvm/vmx.c b/trunk/drivers/kvm/vmx.c similarity index 75% rename from trunk/arch/x86/kvm/vmx.c rename to trunk/drivers/kvm/vmx.c index ad36447e696e..5b397b6c9f93 100644 --- a/trunk/arch/x86/kvm/vmx.c +++ b/trunk/drivers/kvm/vmx.c @@ -15,18 +15,17 @@ * */ +#include "kvm.h" +#include "x86_emulate.h" #include "irq.h" #include "vmx.h" #include "segment_descriptor.h" -#include "mmu.h" -#include #include #include #include #include #include -#include #include #include @@ -34,9 +33,6 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int bypass_guest_pf = 1; -module_param(bypass_guest_pf, bool, 0); - struct vmcs { u32 revision_id; u32 abort; @@ -47,7 +43,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; int launched; u8 fail; - u32 idt_vectoring_info; struct kvm_msr_entry *guest_msrs; struct kvm_msr_entry *host_msrs; int nmsrs; @@ -62,15 +57,8 @@ struct vcpu_vmx { u16 fs_sel, gs_sel, ldt_sel; int gs_ldt_reload_needed; int fs_reload_needed; - int guest_efer_loaded; - } host_state; - struct { - struct { - bool pending; - u8 vector; - unsigned rip; - } irq; - } rmode; + }host_state; + }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -86,13 +74,14 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; +#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) + static struct vmcs_config { int size; int order; u32 revision_id; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; - u32 cpu_based_2nd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; } vmcs_config; @@ -149,6 +138,18 @@ static void save_msrs(struct kvm_msr_entry *e, int n) rdmsrl(e[i].index, e[i].data); } +static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) +{ + return (u64)msr.data & EFER_SAVE_RESTORE_BITS; +} + +static inline int 
msr_efer_need_save_restore(struct vcpu_vmx *vmx) +{ + int efer_offset = vmx->msr_offset_efer; + return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != + msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); +} + static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -163,13 +164,6 @@ static inline int is_no_device(u32 intr_info) (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_invalid_opcode(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); -} - static inline int is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -186,24 +180,6 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm) return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); } -static inline int cpu_has_secondary_exec_ctrls(void) -{ - return (vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); -} - -static inline bool cpu_has_vmx_virtualize_apic_accesses(void) -{ - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); -} - -static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) -{ - return ((cpu_has_vmx_virtualize_apic_accesses()) && - (irqchip_in_kernel(kvm))); -} - static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -246,14 +222,16 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); + rdtscll(vmx->vcpu.host_tsc); } static void vcpu_clear(struct vcpu_vmx *vmx) { - if (vmx->vcpu.cpu == -1) - return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); + if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, + vmx, 0, 1); + else + __vcpu_clear(vmx); vmx->launched = 0; } @@ -297,7 +275,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) u8 error; asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); + : "=q"(error) : "a"(value), "d"(field) : "cc" ); if (unlikely(error)) vmwrite_error(field, value); } @@ -337,12 +315,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); + eb = 1u << PF_VECTOR; if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) eb |= 1u << 1; - if (vcpu->arch.rmode.active) + if (vcpu->rmode.active) eb = ~0; vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -366,42 +344,16 @@ static void reload_tss(void) static void load_transition_efer(struct vcpu_vmx *vmx) { + u64 trans_efer; int efer_offset = vmx->msr_offset_efer; - u64 host_efer = vmx->host_msrs[efer_offset].data; - u64 guest_efer = vmx->guest_msrs[efer_offset].data; - u64 ignore_bits; - if (efer_offset < 0) - return; - /* - * NX is emulated; LMA and LME handled by hardware; SCE meaninless - * outside long mode - */ - ignore_bits = EFER_NX | EFER_SCE; -#ifdef CONFIG_X86_64 - ignore_bits |= EFER_LMA | EFER_LME; - /* SCE is meaningful only in long mode on Intel */ - if (guest_efer & EFER_LMA) - ignore_bits &= ~(u64)EFER_SCE; -#endif - if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return; - - vmx->host_state.guest_efer_loaded = 1; - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & 
ignore_bits; - wrmsrl(MSR_EFER, guest_efer); + trans_efer = vmx->host_msrs[efer_offset].data; + trans_efer &= ~EFER_SAVE_RESTORE_BITS; + trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); + wrmsrl(MSR_EFER, trans_efer); vmx->vcpu.stat.efer_reload++; } -static void reload_host_efer(struct vcpu_vmx *vmx) -{ - if (vmx->host_state.guest_efer_loaded) { - vmx->host_state.guest_efer_loaded = 0; - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); - } -} - static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -441,13 +393,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) + if (is_long_mode(&vmx->vcpu)) { save_msrs(vmx->host_msrs + vmx->msr_offset_kernel_gs_base, 1); - + } #endif load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_transition_efer(vmx); + if (msr_efer_need_save_restore(vmx)) + load_transition_efer(vmx); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -457,7 +410,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) if (!vmx->host_state.loaded) return; - ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) load_fs(vmx->host_state.fs_sel); @@ -477,7 +429,8 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_tss(); save_msrs(vmx->guest_msrs, vmx->save_nmsrs); load_msrs(vmx->host_msrs, vmx->save_nmsrs); - reload_host_efer(vmx); + if (msr_efer_need_save_restore(vmx)) + load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); } /* @@ -527,7 +480,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Make sure the time stamp counter is monotonous. */ rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; + delta = vcpu->host_tsc - tsc_this; vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); } } @@ -535,6 +488,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_load_host_state(to_vmx(vcpu)); + kvm_put_guest_fpu(vcpu); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -543,7 +497,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) return; vcpu->fpu_active = 1; vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) + if (vcpu->cr0 & X86_CR0_TS) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); } @@ -569,7 +523,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.active) + if (vcpu->rmode.active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } @@ -591,25 +545,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) if (interruptibility & 3) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility & ~3); - vcpu->arch.interrupt_window_open = 1; + vcpu->interrupt_window_open = 1; } -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) +static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) { + printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", + vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - nr | INTR_TYPE_EXCEPTION - | (has_error_code ? 
INTR_INFO_DELIEVER_CODE_MASK : 0) - | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); -} - -static bool vmx_exception_injected(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + GP_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); } /* @@ -660,7 +608,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) * if efer.sce is enabled. */ index = __find_msr_index(vmx, MSR_K6_STAR); - if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } #endif @@ -764,10 +712,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) #ifdef CONFIG_X86_64 case MSR_EFER: ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); + if (vmx->host_state.loaded) load_transition_efer(vmx); - } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -804,12 +750,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) /* * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. + * registers to be accessed by indexing vcpu->regs. */ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) { - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); + vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->rip = vmcs_readl(GUEST_RIP); } /* @@ -818,8 +764,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) */ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) { - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); + vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); + vmcs_writel(GUEST_RIP, vcpu->rip); } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -862,15 +808,14 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 idtv_info_field; - idtv_info_field = vmx->idt_vectoring_info; + idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (idtv_info_field & INTR_INFO_VALID_MASK) { if (is_external_interrupt(idtv_info_field)) return idtv_info_field & VECTORING_INFO_VECTOR_MASK; else - printk(KERN_DEBUG "pending exception: not handled yet\n"); + printk("pending exception: not handled yet\n"); } return -1; } @@ -918,7 +863,7 @@ static void hardware_disable(void *garbage) } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) + u32 msr, u32* result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -942,7 +887,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 min, opt; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; - u32 _cpu_based_2nd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; @@ -960,8 +904,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING; - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; +#ifdef CONFIG_X86_64 + opt = CPU_BASED_TPR_SHADOW; +#else + opt = 0; +#endif if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -970,19 +917,6 @@ static __init 
int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & ~CPU_BASED_CR8_STORE_EXITING; #endif - if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min = 0; - opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; - } -#ifndef CONFIG_X86_64 - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; -#endif min = 0; #ifdef CONFIG_X86_64 @@ -1020,7 +954,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; - vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -1110,15 +1043,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; - vcpu->arch.rmode.active = 0; + vcpu->rmode.active = 0; - vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1126,10 +1059,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1139,14 +1072,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); } -static gva_t rmode_tss_base(struct kvm *kvm) +static gva_t rmode_tss_base(struct kvm* kvm) { - if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; - return base_gfn << PAGE_SHIFT; - } - return kvm->arch.tss_addr; + gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; } static void fix_rmode_seg(int seg, struct kvm_save_segment *save) @@ -1157,8 +1086,7 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) save->base = vmcs_readl(sf->base); save->limit = vmcs_read32(sf->limit); save->ar = vmcs_read32(sf->ar_bytes); - vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xfffff); + vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); vmcs_write32(sf->limit, 0xffff); vmcs_write32(sf->ar_bytes, 0xf3); } @@ -1167,20 +1095,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; - vcpu->arch.rmode.active = 1; + vcpu->rmode.active = 1; - vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + 
vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vcpu->arch.rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1198,10 +1125,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); kvm_mmu_reset_context(vcpu); init_rmode_tss(vcpu->kvm); @@ -1222,7 +1149,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | AR_TYPE_BUSY_64_TSS); } - vcpu->arch.shadow_efer |= EFER_LMA; + vcpu->shadow_efer |= EFER_LMA; find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; vmcs_write32(VM_ENTRY_CONTROLS, @@ -1232,7 +1159,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vcpu->arch.shadow_efer &= ~EFER_LMA; + vcpu->shadow_efer &= ~EFER_LMA; vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) @@ -1243,22 +1170,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + vcpu->cr4 &= KVM_GUEST_CR4_MASK; + vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; } static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) + if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) + if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 - if (vcpu->arch.shadow_efer & EFER_LME) { + if (vcpu->shadow_efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) @@ -1269,7 +1196,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); - vcpu->arch.cr0 = cr0; + vcpu->cr0 = cr0; if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) vmx_fpu_activate(vcpu); @@ -1278,16 +1205,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { vmcs_writel(GUEST_CR3, cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) + if (vcpu->cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? + vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? 
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); - vcpu->arch.cr4 = cr4; + vcpu->cr4 = cr4; } #ifdef CONFIG_X86_64 @@ -1297,7 +1224,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - vcpu->arch.shadow_efer = efer; + vcpu->shadow_efer = efer; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1374,17 +1301,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { - vcpu->arch.rmode.tr.selector = var->selector; - vcpu->arch.rmode.tr.base = var->base; - vcpu->arch.rmode.tr.limit = var->limit; - vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); + if (vcpu->rmode.active && seg == VCPU_SREG_TR) { + vcpu->rmode.tr.selector = var->selector; + vcpu->rmode.tr.base = var->base; + vcpu->rmode.tr.limit = var->limit; + vcpu->rmode.tr.ar = vmx_segment_access_rights(var); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.active && var->s) { + if (vcpu->rmode.active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1428,38 +1355,36 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } -static int init_rmode_tss(struct kvm *kvm) +static int init_rmode_tss(struct kvm* kvm) { + struct page *p1, *p2, *p3; gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; - u16 data = 0; - int ret = 0; - int r; + char *page; - down_read(&current->mm->mmap_sem); - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, - RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); - if (r < 0) - goto out; + p1 = gfn_to_page(kvm, fn++); + p2 = gfn_to_page(kvm, fn++); + p3 = gfn_to_page(kvm, fn); - ret = 1; -out: - up_read(&current->mm->mmap_sem); - return ret; + if (!p1 || !p2 || !p3) { + kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); + return 0; + } + + page = kmap_atomic(p1, KM_USER0); + clear_page(page); + *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p2, KM_USER0); + clear_page(page); + kunmap_atomic(page, KM_USER0); + + page = kmap_atomic(p3, KM_USER0); + clear_page(page); + *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; + kunmap_atomic(page, KM_USER0); + + return 1; } static void seg_setup(int seg) @@ -1472,27 +1397,6 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, 0x93); } -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct kvm_userspace_memory_region kvm_userspace_mem; - int r = 0; - - down_write(&current->mm->mmap_sem); - if (kvm->arch.apic_access_page) - goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); -out: - 
up_write(&current->mm->mmap_sem); - return r; -} - /* * Sets up the vmcs for emulated real mode. */ @@ -1503,15 +1407,92 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) unsigned long a; struct descriptor_table dt; int i; + int ret = 0; unsigned long kvm_vmx_return; + u64 msr; u32 exec_control; + if (!init_rmode_tss(vmx->vcpu.kvm)) { + ret = -ENOMEM; + goto out; + } + + vmx->vcpu.rmode.active = 0; + + vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + set_cr8(&vmx->vcpu, 0); + msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vmx->vcpu.vcpu_id == 0) + msr |= MSR_IA32_APICBASE_BSP; + kvm_set_apic_base(&vmx->vcpu, msr); + + fx_init(&vmx->vcpu); + + /* + * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode + * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. + */ + if (vmx->vcpu.vcpu_id == 0) { + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + } else { + vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8); + vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12); + } + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + + vmcs_writel(GUEST_RFLAGS, 0x02); + if (vmx->vcpu.vcpu_id == 0) + vmcs_writel(GUEST_RIP, 0xfff0); + else + vmcs_writel(GUEST_RIP, 0); + vmcs_writel(GUEST_RSP, 0); + + //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 + vmcs_writel(GUEST_DR7, 0x400); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, 0); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + /* I/O */ vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + guest_write_tsc(0); + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + /* Special registers */ + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + /* Control */ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmcs_config.pin_based_exec_ctrl); @@ -1526,16 +1507,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ @@ -1563,7 +1536,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ - asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + asm ("mov 
$.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); @@ -1594,145 +1567,97 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } + setup_msrs(vmx); + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); /* 22.2.1, 20.8.1 */ vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + +#ifdef CONFIG_X86_64 + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->vcpu.apic->regs_page)); + vmcs_write32(TPR_THRESHOLD, 0); +#endif + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) - return -ENOMEM; + vmx->vcpu.cr0 = 0x60000010; + vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode + vmx_set_cr4(&vmx->vcpu, 0); +#ifdef CONFIG_X86_64 + vmx_set_efer(&vmx->vcpu, 0); +#endif + vmx_fpu_activate(&vmx->vcpu); + update_exception_bitmap(&vmx->vcpu); return 0; + +out: + return ret; } -static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 msr; - int ret; - if (!init_rmode_tss(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } - - vmx->vcpu.arch.rmode.active = 0; - - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vmx->vcpu.vcpu_id == 0) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); - - fx_init(&vmx->vcpu); + vmx_vcpu_setup(vmx); +} - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 
- */ - if (vmx->vcpu.vcpu_id == 0) { - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); +static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) +{ + u16 ent[2]; + u16 cs; + u16 ip; + unsigned long flags; + unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); + u16 sp = vmcs_readl(GUEST_RSP); + u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); + + if (sp > ss_limit || sp < 6 ) { + vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", + __FUNCTION__, + vmcs_readl(GUEST_RSP), + vmcs_readl(GUEST_SS_BASE), + vmcs_read32(GUEST_SS_LIMIT)); + return; } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); - - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); - - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - - vmcs_writel(GUEST_RFLAGS, 0x02); - if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); - else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); - - /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ - vmcs_writel(GUEST_DR7, 0x400); - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); - - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - - vmcs_write32(GUEST_ACTIVITY_STATE, 0); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - - guest_write_tsc(0); - - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - - setup_msrs(vmx); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - - if (cpu_has_vmx_tpr_shadow()) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.arch.apic->regs_page)); - vmcs_write32(TPR_THRESHOLD, 0); + if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != + X86EMUL_CONTINUE) { + vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); + return; } - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + flags = vmcs_readl(GUEST_RFLAGS); + cs = vmcs_readl(GUEST_CS_BASE) >> 4; + ip = vmcs_readl(GUEST_RIP); - vmx->vcpu.arch.cr0 = 0x60000010; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ - vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 - vmx_set_efer(&vmx->vcpu, 0); -#endif - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); - return 0; + if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || + emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { + vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); + return; + } -out: - return ret; + vmcs_writel(GUEST_RFLAGS, flags & + ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); + vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; + vmcs_writel(GUEST_CS_BASE, 
ent[1] << 4); + vmcs_writel(GUEST_RIP, ent[0]); + vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); } static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vcpu->arch.rmode.active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + if (vcpu->rmode.active) { + inject_rmode_irq(vcpu, irq); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -1741,13 +1666,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->arch.irq_summary); - int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int word_index = __ffs(vcpu->irq_summary); + int bit_index = __ffs(vcpu->irq_pending[word_index]); int irq = word_index * BITS_PER_LONG + bit_index; - clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); - if (!vcpu->arch.irq_pending[word_index]) - clear_bit(word_index, &vcpu->arch.irq_summary); + clear_bit(bit_index, &vcpu->irq_pending[word_index]); + if (!vcpu->irq_pending[word_index]) + clear_bit(word_index, &vcpu->irq_summary); vmx_inject_irq(vcpu, irq); } @@ -1757,12 +1682,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, { u32 cpu_based_vm_exec_control; - vcpu->arch.interrupt_window_open = + vcpu->interrupt_window_open = ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && + if (vcpu->interrupt_window_open && + vcpu->irq_summary && !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) /* * If interrupts enabled, and not blocked by sti or mov ss. Good. @@ -1770,8 +1695,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, kvm_do_inject_irq(vcpu); cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - if (!vcpu->arch.interrupt_window_open && - (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) + if (!vcpu->interrupt_window_open && + (vcpu->irq_summary || kvm_run->request_interrupt_window)) /* * Interrupts blocked. Wait for unblock. */ @@ -1781,23 +1706,6 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = 8, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - - ret = kvm_set_memory_region(kvm, &tss_mem, 0); - if (ret) - return ret; - kvm->arch.tss_addr = addr; - return 0; -} - static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) { struct kvm_guest_debug *dbg = &vcpu->guest_debug; @@ -1819,7 +1727,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->arch.rmode.active) + if (!vcpu->rmode.active) return 0; /* @@ -1827,31 +1735,32 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. 
*/ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) return 1; return 0; } static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info, error_code; unsigned long cr2, rip; u32 vect_info; enum emulation_result er; + int r; - vect_info = vmx->idt_vectoring_info; + vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) + !is_page_fault(intr_info)) { printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); + } if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; - set_bit(irq, vcpu->arch.irq_pending); - set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ @@ -1862,34 +1771,52 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - error_code = 0; rip = vmcs_readl(GUEST_RIP); if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); - return kvm_mmu_page_fault(vcpu, cr2, error_code); + + mutex_lock(&vcpu->kvm->lock); + r = kvm_mmu_page_fault(vcpu, cr2, error_code); + if (r < 0) { + mutex_unlock(&vcpu->kvm->lock); + return r; + } + if (!r) { + mutex_unlock(&vcpu->kvm->lock); + return 1; + } + + er = emulate_instruction(vcpu, kvm_run, cr2, error_code); + mutex_unlock(&vcpu->kvm->lock); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++vcpu->stat.mmio_exits; + return 0; + case EMULATE_FAIL: + kvm_report_emulation_failure(vcpu, "pagetable"); + break; + default: + BUG(); + } } - if (vcpu->arch.rmode.active && + if (vcpu->rmode.active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; + if (vcpu->halt_request) { + vcpu->halt_request = 0; return kvm_emulate_halt(vcpu); } return 1; } - if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == - (INTR_TYPE_EXCEPTION | 1)) { + if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; } @@ -1923,8 +1850,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) string = (exit_qualification & 16) != 0; if (string) { - if (emulate_instruction(vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1947,6 +1873,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xc1; + hypercall[3] = 0xc3; } static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -1963,25 +1890,23 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) switch (cr) { case 0: vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->arch.regs[reg]); + 
set_cr0(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); return 1; case 3: vcpu_load_rsp_rip(vcpu); - set_cr3(vcpu, vcpu->arch.regs[reg]); + set_cr3(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); return 1; case 4: vcpu_load_rsp_rip(vcpu); - set_cr4(vcpu, vcpu->arch.regs[reg]); + set_cr4(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - set_cr8(vcpu, vcpu->arch.regs[reg]); + set_cr8(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - return 1; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; }; @@ -1989,8 +1914,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 2: /* clts */ vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + vcpu->cr0 &= ~X86_CR0_TS; + vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); vmx_fpu_activate(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -1998,13 +1923,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) switch (cr) { case 3: vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; + vcpu->regs[reg] = vcpu->cr3; vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = get_cr8(vcpu); + vcpu->regs[reg] = get_cr8(vcpu); vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -2050,7 +1975,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->arch.regs[reg] = val; + vcpu->regs[reg] = val; } else { /* mov to dr */ } @@ -2067,29 +1992,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; u64 data; if (vmx_get_msr(vcpu, ecx, &data)) { - kvm_inject_gp(vcpu, 0); + vmx_inject_gp(vcpu, 0); return 1; } /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + vcpu->regs[VCPU_REGS_RAX] = data & -1u; + vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; skip_emulated_instruction(vcpu); return 1; } static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + u32 ecx = vcpu->regs[VCPU_REGS_RCX]; + u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); if (vmx_set_msr(vcpu, ecx, data) != 0) { - kvm_inject_gp(vcpu, 0); + vmx_inject_gp(vcpu, 0); return 1; } @@ -2117,7 +2042,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, * possible */ if (kvm_run->request_interrupt_window && - !vcpu->arch.irq_summary) { + !vcpu->irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.irq_window_exits; return 0; @@ -2134,35 +2059,7 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); - kvm_emulate_hypercall(vcpu); - return 1; -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - skip_emulated_instruction(vcpu); - /* TODO: Add support for VT-d/pass-through device */ - return 1; -} - -static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 
-{ - u64 exit_qualification; - enum emulation_result er; - unsigned long offset; - - exit_qualification = vmcs_read64(EXIT_QUALIFICATION); - offset = exit_qualification & 0xffful; - - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - - if (er != EMULATE_DONE) { - printk(KERN_ERR - "Fail to handle apic access vmexit! Offset is 0x%lx\n", - offset); - return -ENOTSUPP; - } - return 1; + return kvm_hypercall(vcpu, kvm_run); } /* @@ -2184,9 +2081,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, - [EXIT_REASON_APIC_ACCESS] = handle_apic_access, - [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold }; static const int kvm_vmx_max_exit_handlers = @@ -2198,9 +2093,9 @@ static const int kvm_vmx_max_exit_handlers = */ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { + u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); u32 exit_reason = vmcs_read32(VM_EXIT_REASON); struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 vectoring_info = vmx->idt_vectoring_info; if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -2209,8 +2104,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } - if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - exit_reason != EXIT_REASON_EXCEPTION_NMI) + if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && + exit_reason != EXIT_REASON_EXCEPTION_NMI ) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " "exit reason is 0x%x\n", __FUNCTION__, exit_reason); if (exit_reason < kvm_vmx_max_exit_handlers @@ -2255,38 +2150,26 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void vmx_intr_assist(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 idtv_info_field, intr_info_field; int has_ext_irq, interrupt_window_open; int vector; + kvm_inject_pending_timer_irqs(vcpu); update_tpr_threshold(vcpu); has_ext_irq = kvm_cpu_has_interrupt(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmx->idt_vectoring_info; + idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (intr_info_field & INTR_INFO_VALID_MASK) { if (idtv_info_field & INTR_INFO_VALID_MASK) { /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); + printk(KERN_ERR "Fault when IDT_Vectoring\n"); } if (has_ext_irq) enable_irq_window(vcpu); return; } if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; - - vmx_inject_irq(vcpu, vect); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); - return; - } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); @@ -2311,29 +2194,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) enable_irq_window(vcpu); } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. 
- */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) -{ - vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) - return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - vmx->idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; -} - static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2344,47 +2204,50 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) */ vmcs_writel(HOST_CR0, read_cr0()); - asm( + asm ( /* Store host registers */ #ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" + "push %%rax; push %%rbx; push %%rdx;" + "push %%rsi; push %%rdi; push %%rbp;" + "push %%r8; push %%r9; push %%r10; push %%r11;" + "push %%r12; push %%r13; push %%r14; push %%r15;" "push %%rcx \n\t" + ASM_VMX_VMWRITE_RSP_RDX "\n\t" #else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif + "pusha; push %%ecx \n\t" ASM_VMX_VMWRITE_RSP_RDX "\n\t" +#endif /* Check if vmlaunch of vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" + "cmp $0, %1 \n\t" /* Load guest registers. Don't clobber flags. */ #ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" + "mov %c[cr2](%3), %%rax \n\t" "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ + "mov %c[rax](%3), %%rax \n\t" + "mov %c[rbx](%3), %%rbx \n\t" + "mov %c[rdx](%3), %%rdx \n\t" + "mov %c[rsi](%3), %%rsi \n\t" + "mov %c[rdi](%3), %%rdi \n\t" + "mov %c[rbp](%3), %%rbp \n\t" + "mov %c[r8](%3), %%r8 \n\t" + "mov %c[r9](%3), %%r9 \n\t" + "mov %c[r10](%3), %%r10 \n\t" + "mov %c[r11](%3), %%r11 \n\t" + "mov %c[r12](%3), %%r12 \n\t" + "mov %c[r13](%3), %%r13 \n\t" + "mov %c[r14](%3), %%r14 \n\t" + "mov %c[r15](%3), %%r15 \n\t" + "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ #else - "mov %c[cr2](%0), %%eax \n\t" + "mov %c[cr2](%3), %%eax \n\t" "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ + "mov %c[rax](%3), %%eax \n\t" + "mov %c[rbx](%3), %%ebx \n\t" + "mov %c[rdx](%3), %%edx \n\t" + "mov %c[rsi](%3), %%esi \n\t" + "mov %c[rdi](%3), %%edi \n\t" + "mov %c[rbp](%3), %%ebp \n\t" + "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ #endif /* Enter guest mode */ "jne .Llaunched \n\t" @@ -2394,79 +2257,72 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ #ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp); popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" - "mov %%r8, 
%c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" + "xchg %3, (%%rsp) \n\t" + "mov %%rax, %c[rax](%3) \n\t" + "mov %%rbx, %c[rbx](%3) \n\t" + "pushq (%%rsp); popq %c[rcx](%3) \n\t" + "mov %%rdx, %c[rdx](%3) \n\t" + "mov %%rsi, %c[rsi](%3) \n\t" + "mov %%rdi, %c[rdi](%3) \n\t" + "mov %%rbp, %c[rbp](%3) \n\t" + "mov %%r8, %c[r8](%3) \n\t" + "mov %%r9, %c[r9](%3) \n\t" + "mov %%r10, %c[r10](%3) \n\t" + "mov %%r11, %c[r11](%3) \n\t" + "mov %%r12, %c[r12](%3) \n\t" + "mov %%r13, %c[r13](%3) \n\t" + "mov %%r14, %c[r14](%3) \n\t" + "mov %%r15, %c[r15](%3) \n\t" "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" + "mov %%rax, %c[cr2](%3) \n\t" + "mov (%%rsp), %3 \n\t" - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" + "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" + "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" + "pop %%rbp; pop %%rdi; pop %%rsi;" + "pop %%rdx; pop %%rbx; pop %%rax \n\t" #else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" + "xchg %3, (%%esp) \n\t" + "mov %%eax, %c[rax](%3) \n\t" + "mov %%ebx, %c[rbx](%3) \n\t" + "pushl (%%esp); popl %c[rcx](%3) \n\t" + "mov %%edx, %c[rdx](%3) \n\t" + "mov %%esi, %c[rsi](%3) \n\t" + "mov %%edi, %c[rdi](%3) \n\t" + "mov %%ebp, %c[rbp](%3) \n\t" "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" + "mov %%eax, %c[cr2](%3) \n\t" + "mov (%%esp), %3 \n\t" - "pop %%ebp; pop %%ebp; pop %%edx \n\t" -#endif - "setbe %c[fail](%0) \n\t" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), + "pop %%ecx; popa \n\t" #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) - : "cc", "memory" + "setbe %0 \n\t" + : "=q" (vmx->fail) + : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), + "c"(vcpu), + [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct 
kvm_vcpu, regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), #ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" + [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), + [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), + [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), #endif - ); - - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); + [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) + : "cc", "memory" ); - vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; intr_info = vmcs_read32(VM_EXIT_INTR_INFO); @@ -2476,6 +2332,36 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm("int $2"); } +static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, + unsigned long addr, + u32 err_code) +{ + u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + + ++vcpu->stat.pf_guest; + + if (is_page_fault(vect_info)) { + printk(KERN_DEBUG "inject_page_fault: " + "double fault 0x%lx @ 0x%lx\n", + addr, vmcs_readl(GUEST_RIP)); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + DF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + return; + } + vcpu->cr2 = addr; + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + PF_VECTOR | + INTR_TYPE_EXCEPTION | + INTR_INFO_DELIEVER_CODE_MASK | + INTR_INFO_VALID_MASK); + +} + static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2511,6 +2397,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; + if (irqchip_in_kernel(kvm)) { + err = kvm_create_lapic(&vmx->vcpu); + if (err < 0) + goto free_vcpu; + } + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!vmx->guest_msrs) { err = -ENOMEM; @@ -2572,7 +2464,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .check_processor_compatibility = vmx_check_processor_compat, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -2608,6 +2499,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .inject_page_fault = vmx_inject_page_fault, + + .inject_gp = vmx_inject_gp, .run = vmx_vcpu_run, .handle_exit = kvm_handle_exit, @@ -2615,12 +2509,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .patch_hypercall = vmx_patch_hypercall, .get_irq = vmx_get_irq, .set_irq = vmx_inject_irq, - .queue_exception = vmx_queue_exception, - .exception_injected = vmx_exception_injected, .inject_pending_irq = vmx_intr_assist, .inject_pending_vectors = do_interrupt_requests, - - .set_tss_addr = vmx_set_tss_addr, }; static int __init vmx_init(void) @@ -2651,13 +2541,10 @@ static int 
__init vmx_init(void) memset(iova, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); + r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) goto out1; - if (bypass_guest_pf) - kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - return 0; out1: @@ -2672,7 +2559,7 @@ static void __exit vmx_exit(void) __free_page(vmx_io_bitmap_b); __free_page(vmx_io_bitmap_a); - kvm_exit(); + kvm_exit_x86(); } module_init(vmx_init) diff --git a/trunk/arch/x86/kvm/vmx.h b/trunk/drivers/kvm/vmx.h similarity index 96% rename from trunk/arch/x86/kvm/vmx.h rename to trunk/drivers/kvm/vmx.h index d52ae8d7303d..fd4e14666088 100644 --- a/trunk/arch/x86/kvm/vmx.h +++ b/trunk/drivers/kvm/vmx.h @@ -25,9 +25,6 @@ * */ -/* - * Definitions of Primary Processor-Based VM-Execution Controls. - */ #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 @@ -45,12 +42,6 @@ #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 -/* - * Definitions of Secondary Processor-Based VM-Execution Controls. - */ -#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 -#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 - #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -63,6 +54,8 @@ #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 + /* VMCS Encodings */ enum vmcs_field { GUEST_ES_SELECTOR = 0x00000800, @@ -96,8 +89,6 @@ enum vmcs_field { TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, - APIC_ACCESS_ADDR = 0x00002014, - APIC_ACCESS_ADDR_HIGH = 0x00002015, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, @@ -223,8 +214,6 @@ enum vmcs_field { #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_WBINVD 54 /* * Interruption-information format @@ -241,14 +230,13 @@ enum vmcs_field { #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ /* * Exit Qualifications for MOV for Control Register Access */ -#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ -#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ #define LMSW_SOURCE_DATA_SHIFT 16 #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ #define REG_EAX (0 << 8) @@ -271,11 +259,11 @@ enum vmcs_field { /* * Exit Qualifications for MOV for Debug Register Access */ -#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ #define TYPE_MOV_TO_DR (0 << 4) #define TYPE_MOV_FROM_DR (1 << 4) -#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. 
*/ +#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ /* segment AR */ @@ -319,6 +307,4 @@ enum vmcs_field { #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 - #endif diff --git a/trunk/drivers/kvm/x86_emulate.c b/trunk/drivers/kvm/x86_emulate.c new file mode 100644 index 000000000000..bd46de6bf891 --- /dev/null +++ b/trunk/drivers/kvm/x86_emulate.c @@ -0,0 +1,1662 @@ +/****************************************************************************** + * x86_emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privileged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include +#include +#include +#define DPRINTF(_f, _a ...) printf( _f , ## _a ) +#else +#include "kvm.h" +#define DPRINTF(x...) do {} while (0) +#endif +#include "x86_emulate.h" +#include + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov ,' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstMask (3<<1) +/* Source operand type. */ +#define SrcNone (0<<3) /* No source operand. */ +#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<3) /* Register operand. */ +#define SrcMem (2<<3) /* Memory operand. */ +#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ +#define SrcImm (5<<3) /* Immediate operand. */ +#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<3) +/* Generic ModRM decode. */ +#define ModRM (1<<6) +/* Destination is only written; never read. 
*/ +#define Mov (1<<7) +#define BitOp (1<<8) + +static u8 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + SrcImmByte, SrcImm, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x40 - 0x4F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x50 - 0x57 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x58 - 0x5F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x60 - 0x67 */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + 0, 0, ImplicitOps|Mov, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x78 - 0x7F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x80 - 0x87 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, + ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, + ByteOp | ImplicitOps, ImplicitOps, + /* 0xB0 - 0xBF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, 
DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE8 - 0xEF */ + ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + ImplicitOps, 0, + ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + /* 0xF8 - 0xFF */ + 0, 0, 0, 0, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM +}; + +static u16 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +/* EFLAGS bit definitions. */ +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. 
+ */ + +#if defined(CONFIG_X86_64) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ + "push %"_sav"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + /* _sav &= ~msk; */ \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",%"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"w %"_wx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"l %"_lx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_2op_8byte(_op, _src, _dst, \ + _eflags, _qx, _qy); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"b %"_bx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _by ((_src).val), "i" (EFLAGS_MASK) ); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has only one explicit operand (no source operand). 
*/ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + unsigned long _tmp; \ + \ + switch ( (_dst).bytes ) \ + { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"b %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"w %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"l %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + break; \ + case 8: \ + __emulate_1op_8byte(_op, _dst, _eflags); \ + break; \ + } \ + } while (0) + +/* Emulate an instruction with quadword operands (x86/64 only). */ +#if defined(CONFIG_X86_64) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","4","2") \ + _op"q %"_qx"3,%1; " \ + _POST_EFLAGS("0","4","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ + } while (0) + +#define __emulate_1op_8byte(_op, _dst, _eflags) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0","3","2") \ + _op"q %1; " \ + _POST_EFLAGS("0","3","2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK) ); \ + } while (0) + +#elif defined(__i386__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) +#define __emulate_1op_8byte(_op, _dst, _eflags) +#endif /* __i386__ */ + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ + (_size), ctxt->vcpu); \ + if ( rc != 0 ) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +/* Access/update address held in a register, based on addressing mode. */ +#define address_mask(reg) \ + ((ad_bytes == sizeof(unsigned long)) ? \ + (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1))) +#define register_address(base, reg) \ + ((base) + address_mask(reg)) +#define register_address_increment(reg, inc) \ + do { \ + /* signed type ensures sign extension to long */ \ + int _inc = (inc); \ + if ( ad_bytes == sizeof(unsigned long) ) \ + (reg) += _inc; \ + else \ + (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ + (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ + } while (0) + +#define JMP_REL(rel) \ + do { \ + register_address_increment(_eip, rel); \ + } while (0) + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
+ */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); + return rc; +} + +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + +int +x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + unsigned d; + u8 b, sib, twobyte = 0, rex_prefix = 0; + u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; + unsigned long *override_base = NULL; + unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; + int rc = 0; + struct operand src, dst; + unsigned long cr2 = ctxt->cr2; + int mode = ctxt->mode; + unsigned long modrm_ea; + int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int no_wb = 0; + u64 msr_data; + + /* Shadow copy of register state. Committed on successful emulation. */ + unsigned long _regs[NR_VCPU_REGS]; + unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; + unsigned long modrm_val = 0; + + memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + op_bytes = ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + op_bytes = ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + op_bytes = 4; + ad_bytes = 8; + break; +#endif + default: + return -1; + } + + /* Legacy prefixes. */ + for (i = 0; i < 8; i++) { + switch (b = insn_fetch(u8, 1, _eip)) { + case 0x66: /* operand-size override */ + op_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + ad_bytes ^= 12; /* switch between 4/8 bytes */ + else + ad_bytes ^= 6; /* switch between 2/4 bytes */ + break; + case 0x2e: /* CS override */ + override_base = &ctxt->cs_base; + break; + case 0x3e: /* DS override */ + override_base = &ctxt->ds_base; + break; + case 0x26: /* ES override */ + override_base = &ctxt->es_base; + break; + case 0x64: /* FS override */ + override_base = &ctxt->fs_base; + break; + case 0x65: /* GS override */ + override_base = &ctxt->gs_base; + break; + case 0x36: /* SS override */ + override_base = &ctxt->ss_base; + break; + case 0xf0: /* LOCK */ + lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP/REPE/REPZ */ + rep_prefix = 1; + break; + default: + goto done_prefixes; + } + } + +done_prefixes: + + /* REX prefix. 
*/ + if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { + rex_prefix = b; + if (b & 8) + op_bytes = 8; /* REX.W */ + modrm_reg = (b & 4) << 1; /* REX.R */ + index_reg = (b & 2) << 2; /* REX.X */ + modrm_rm = base_reg = (b & 1) << 3; /* REG.B */ + b = insn_fetch(u8, 1, _eip); + } + + /* Opcode byte(s). */ + d = opcode_table[b]; + if (d == 0) { + /* Two-byte opcode? */ + if (b == 0x0f) { + twobyte = 1; + b = insn_fetch(u8, 1, _eip); + d = twobyte_table[b]; + } + + /* Unrecognised? */ + if (d == 0) + goto cannot_emulate; + } + + /* ModRM and SIB bytes. */ + if (d & ModRM) { + modrm = insn_fetch(u8, 1, _eip); + modrm_mod |= (modrm & 0xc0) >> 6; + modrm_reg |= (modrm & 0x38) >> 3; + modrm_rm |= (modrm & 0x07); + modrm_ea = 0; + use_modrm_ea = 1; + + if (modrm_mod == 3) { + modrm_val = *(unsigned long *) + decode_register(modrm_rm, _regs, d & ByteOp); + goto modrm_done; + } + + if (ad_bytes == 2) { + unsigned bx = _regs[VCPU_REGS_RBX]; + unsigned bp = _regs[VCPU_REGS_RBP]; + unsigned si = _regs[VCPU_REGS_RSI]; + unsigned di = _regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (modrm_mod) { + case 0: + if (modrm_rm == 6) + modrm_ea += insn_fetch(u16, 2, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(u16, 2, _eip); + break; + } + switch (modrm_rm) { + case 0: + modrm_ea += bx + si; + break; + case 1: + modrm_ea += bx + di; + break; + case 2: + modrm_ea += bp + si; + break; + case 3: + modrm_ea += bp + di; + break; + case 4: + modrm_ea += si; + break; + case 5: + modrm_ea += di; + break; + case 6: + if (modrm_mod != 0) + modrm_ea += bp; + break; + case 7: + modrm_ea += bx; + break; + } + if (modrm_rm == 2 || modrm_rm == 3 || + (modrm_rm == 6 && modrm_mod != 0)) + if (!override_base) + override_base = &ctxt->ss_base; + modrm_ea = (u16)modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + switch (modrm_rm) { + case 4: + case 12: + sib = insn_fetch(u8, 1, _eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + switch (base_reg) { + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[base_reg]; + else + modrm_ea += insn_fetch(s32, 4, _eip); + break; + default: + modrm_ea += _regs[base_reg]; + } + switch (index_reg) { + case 4: + break; + default: + modrm_ea += _regs[index_reg] << scale; + + } + break; + case 5: + if (modrm_mod != 0) + modrm_ea += _regs[modrm_rm]; + else if (mode == X86EMUL_MODE_PROT64) + rip_relative = 1; + break; + default: + modrm_ea += _regs[modrm_rm]; + break; + } + switch (modrm_mod) { + case 0: + if (modrm_rm == 5) + modrm_ea += insn_fetch(s32, 4, _eip); + break; + case 1: + modrm_ea += insn_fetch(s8, 1, _eip); + break; + case 2: + modrm_ea += insn_fetch(s32, 4, _eip); + break; + } + } + if (!override_base) + override_base = &ctxt->ds_base; + if (mode == X86EMUL_MODE_PROT64 && + override_base != &ctxt->fs_base && + override_base != &ctxt->gs_base) + override_base = NULL; + + if (override_base) + modrm_ea += *override_base; + + if (rip_relative) { + modrm_ea += _eip; + switch (d & SrcMask) { + case SrcImmByte: + modrm_ea += 1; + break; + case SrcImm: + if (d & ByteOp) + modrm_ea += 1; + else + if (op_bytes == 8) + modrm_ea += 4; + else + modrm_ea += op_bytes; + } + } + if (ad_bytes != 8) + modrm_ea = (u32)modrm_ea; + cr2 = modrm_ea; + modrm_done: + ; + } + + /* + * Decode and fetch the source operand: register, memory + * or immediate. 
+ */ + switch (d & SrcMask) { + case SrcNone: + break; + case SrcReg: + src.type = OP_REG; + if (d & ByteOp) { + src.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + src.val = src.orig_val = *(u8 *) src.ptr; + src.bytes = 1; + } else { + src.ptr = decode_register(modrm_reg, _regs, 0); + switch ((src.bytes = op_bytes)) { + case 2: + src.val = src.orig_val = *(u16 *) src.ptr; + break; + case 4: + src.val = src.orig_val = *(u32 *) src.ptr; + break; + case 8: + src.val = src.orig_val = *(u64 *) src.ptr; + break; + } + } + break; + case SrcMem16: + src.bytes = 2; + goto srcmem_common; + case SrcMem32: + src.bytes = 4; + goto srcmem_common; + case SrcMem: + src.bytes = (d & ByteOp) ? 1 : op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (twobyte && b == 0x01 && modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((d & ModRM) && modrm_mod == 3) { + src.type = OP_REG; + break; + } + src.type = OP_MEM; + src.ptr = (unsigned long *)cr2; + src.val = 0; + if ((rc = ops->read_emulated((unsigned long)src.ptr, + &src.val, src.bytes, ctxt->vcpu)) != 0) + goto done; + src.orig_val = src.val; + break; + case SrcImm: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + break; + case SrcImmByte: + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = 1; + src.val = insn_fetch(s8, 1, _eip); + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + goto special_insn; + case DstReg: + dst.type = OP_REG; + if ((d & ByteOp) + && !(twobyte && (b == 0xb6 || b == 0xb7))) { + dst.ptr = decode_register(modrm_reg, _regs, + (rex_prefix == 0)); + dst.val = *(u8 *) dst.ptr; + dst.bytes = 1; + } else { + dst.ptr = decode_register(modrm_reg, _regs, 0); + switch ((dst.bytes = op_bytes)) { + case 2: + dst.val = *(u16 *)dst.ptr; + break; + case 4: + dst.val = *(u32 *)dst.ptr; + break; + case 8: + dst.val = *(u64 *)dst.ptr; + break; + } + } + break; + case DstMem: + dst.type = OP_MEM; + dst.ptr = (unsigned long *)cr2; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.val = 0; + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((d & ModRM) && modrm_mod == 3) { + dst.type = OP_REG; + break; + } + if (d & BitOp) { + unsigned long mask = ~(dst.bytes * 8 - 1); + + dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; + } + if (!(d & Mov) && /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)dst.ptr, + &dst.val, dst.bytes, ctxt->vcpu)) != 0)) + goto done; + break; + } + dst.orig_val = dst.val; + + if (twobyte) + goto twobyte_insn; + + switch (b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", src, dst, _eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", src, dst, _eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", src, dst, _eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", src, dst, _eflags); + break; + case 0x20 ... 
0x23: + and: /* and */ + emulate_2op_SrcV("and", src, dst, _eflags); + break; + case 0x24: /* and al imm8 */ + dst.type = OP_REG; + dst.ptr = &_regs[VCPU_REGS_RAX]; + dst.val = *(u8 *)dst.ptr; + dst.bytes = 1; + dst.orig_val = dst.val; + goto and; + case 0x25: /* and ax imm16, or eax imm32 */ + dst.type = OP_REG; + dst.bytes = op_bytes; + dst.ptr = &_regs[VCPU_REGS_RAX]; + if (op_bytes == 2) + dst.val = *(u16 *)dst.ptr; + else + dst.val = *(u32 *)dst.ptr; + dst.orig_val = dst.val; + goto and; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", src, dst, _eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", src, dst, _eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", src, dst, _eflags); + break; + case 0x63: /* movsxd */ + if (mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + dst.val = (s32) src.val; + break; + case 0x80 ... 0x83: /* Grp1 */ + switch (modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + test: /* test */ + emulate_2op_SrcV("test", src, dst, _eflags); + break; + case 0x86 ... 0x87: /* xchg */ + /* Write back the register source. */ + switch (dst.bytes) { + case 1: + *(u8 *) src.ptr = (u8) dst.val; + break; + case 2: + *(u16 *) src.ptr = (u16) dst.val; + break; + case 4: + *src.ptr = (u32) dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *src.ptr = dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + dst.val = src.val; + lock_prefix = 1; + break; + case 0x88 ... 0x8b: /* mov */ + goto mov; + case 0x8d: /* lea r16/r32, m */ + dst.val = modrm_val; + break; + case 0x8f: /* pop (sole member of Grp1a) */ + /* 64-bit mode: POP always pops a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) + dst.bytes = 8; + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + &dst.val, dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); + break; + case 0xa0 ... 0xa1: /* mov */ + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + dst.val = src.val; + _eip += ad_bytes; /* skip src displacement */ + break; + case 0xa2 ... 0xa3: /* mov */ + dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; + _eip += ad_bytes; /* skip dst displacement */ + break; + case 0xc0 ... 0xc1: + grp2: /* Grp2 */ + switch (modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", src, dst, _eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", src, dst, _eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", src, dst, _eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", src, dst, _eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", src, dst, _eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", src, dst, _eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", src, dst, _eflags); + break; + } + break; + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + dst.val = src.val; + break; + case 0xd0 ... 0xd1: /* Grp2 */ + src.val = 1; + goto grp2; + case 0xd2 ... 0xd3: /* Grp2 */ + src.val = _regs[VCPU_REGS_RCX]; + goto grp2; + case 0xf6 ... 0xf7: /* Grp3 */ + switch (modrm_reg) { + case 0 ... 1: /* test */ + /* + * Special case in Grp3: test has an immediate + * source operand. 
+ */ + src.type = OP_IMM; + src.ptr = (unsigned long *)_eip; + src.bytes = (d & ByteOp) ? 1 : op_bytes; + if (src.bytes == 8) + src.bytes = 4; + switch (src.bytes) { + case 1: + src.val = insn_fetch(s8, 1, _eip); + break; + case 2: + src.val = insn_fetch(s16, 2, _eip); + break; + case 4: + src.val = insn_fetch(s32, 4, _eip); + break; + } + goto test; + case 2: /* not */ + dst.val = ~dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", dst, _eflags); + break; + default: + goto cannot_emulate; + } + break; + case 0xfe ... 0xff: /* Grp4/Grp5 */ + switch (modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", dst, _eflags); + break; + case 1: /* dec */ + emulate_1op("dec", dst, _eflags); + break; + case 4: /* jmp abs */ + if (b == 0xff) + _eip = dst.val; + else + goto cannot_emulate; + break; + case 6: /* push */ + /* 64-bit mode: PUSH always pushes a 64-bit operand. */ + if (mode == X86EMUL_MODE_PROT64) { + dst.bytes = 8; + if ((rc = ops->read_std((unsigned long)dst.ptr, + &dst.val, 8, + ctxt->vcpu)) != 0) + goto done; + } + register_address_increment(_regs[VCPU_REGS_RSP], + -dst.bytes); + if ((rc = ops->write_emulated( + register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), + &dst.val, dst.bytes, ctxt->vcpu)) != 0) + goto done; + no_wb = 1; + break; + default: + goto cannot_emulate; + } + break; + } + +writeback: + if (!no_wb) { + switch (dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (dst.bytes) { + case 1: + *(u8 *)dst.ptr = (u8)dst.val; + break; + case 2: + *(u16 *)dst.ptr = (u16)dst.val; + break; + case 4: + *dst.ptr = (u32)dst.val; + break; /* 64b: zero-ext */ + case 8: + *dst.ptr = dst.val; + break; + } + break; + case OP_MEM: + if (lock_prefix) + rc = ops->cmpxchg_emulated((unsigned long)dst. + ptr, &dst.orig_val, + &dst.val, dst.bytes, + ctxt->vcpu); + else + rc = ops->write_emulated((unsigned long)dst.ptr, + &dst.val, dst.bytes, + ctxt->vcpu); + if (rc != 0) + goto done; + default: + break; + } + } + + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); + ctxt->eflags = _eflags; + ctxt->vcpu->rip = _eip; + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + +special_insn: + if (twobyte) + goto twobyte_special_insn; + switch(b) { + case 0x50 ... 0x57: /* push reg */ + if (op_bytes == 2) + src.val = (u16) _regs[b & 0x7]; + else + src.val = (u32) _regs[b & 0x7]; + dst.type = OP_MEM; + dst.bytes = op_bytes; + dst.val = src.val; + register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); + dst.ptr = (void *) register_address( + ctxt->ss_base, _regs[VCPU_REGS_RSP]); + break; + case 0x58 ... 0x5f: /* pop reg */ + dst.ptr = (unsigned long *)&_regs[b & 0x7]; + pop_instruction: + if ((rc = ops->read_std(register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) + != 0) + goto done; + + register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); + no_wb = 1; /* Disable writeback. */ + break; + case 0x6a: /* push imm8 */ + src.val = 0L; + src.val = insn_fetch(s8, 1, _eip); + push: + dst.type = OP_MEM; + dst.bytes = op_bytes; + dst.val = src.val; + register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); + dst.ptr = (void *) register_address(ctxt->ss_base, + _regs[VCPU_REGS_RSP]); + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 1, /* in */ + (d & ByteOp) ? 1 : op_bytes, /* size */ + rep_prefix ? 
+ address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ + (_eflags & EFLG_DF), /* down */ + register_address(ctxt->es_base, + _regs[VCPU_REGS_RDI]), /* address */ + rep_prefix, + _regs[VCPU_REGS_RDX] /* port */ + ) == 0) + return -1; + return 0; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 0, /* in */ + (d & ByteOp) ? 1 : op_bytes, /* size */ + rep_prefix ? + address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ + (_eflags & EFLG_DF), /* down */ + register_address(override_base ? + *override_base : ctxt->ds_base, + _regs[VCPU_REGS_RSI]), /* address */ + rep_prefix, + _regs[VCPU_REGS_RDX] /* port */ + ) == 0) + return -1; + return 0; + case 0x70 ... 0x7f: /* jcc (short) */ { + int rel = insn_fetch(s8, 1, _eip); + + if (test_cc(b, _eflags)) + JMP_REL(rel); + break; + } + case 0x9c: /* pushf */ + src.val = (unsigned long) _eflags; + goto push; + case 0x9d: /* popf */ + dst.ptr = (unsigned long *) &_eflags; + goto pop_instruction; + case 0xc3: /* ret */ + dst.ptr = &_eip; + goto pop_instruction; + case 0xf4: /* hlt */ + ctxt->vcpu->halt_request = 1; + goto done; + } + if (rep_prefix) { + if (_regs[VCPU_REGS_RCX] == 0) { + ctxt->vcpu->rip = _eip; + goto done; + } + _regs[VCPU_REGS_RCX]--; + _eip = ctxt->vcpu->rip; + } + switch (b) { + case 0xa4 ... 0xa5: /* movs */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)register_address(ctxt->es_base, + _regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address( + override_base ? *override_base : ctxt->ds_base, + _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + DPRINTF("Urk! I don't handle CMPS.\n"); + goto cannot_emulate; + case 0xaa ... 0xab: /* stos */ + dst.type = OP_MEM; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)cr2; + dst.val = _regs[VCPU_REGS_RAX]; + register_address_increment(_regs[VCPU_REGS_RDI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + dst.type = OP_REG; + dst.bytes = (d & ByteOp) ? 1 : op_bytes; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, + ctxt->vcpu)) != 0) + goto done; + register_address_increment(_regs[VCPU_REGS_RSI], + (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + case 0xe8: /* call (near) */ { + long int rel; + switch (op_bytes) { + case 2: + rel = insn_fetch(s16, 2, _eip); + break; + case 4: + rel = insn_fetch(s32, 4, _eip); + break; + case 8: + rel = insn_fetch(s64, 8, _eip); + break; + default: + DPRINTF("Call: Invalid op_bytes\n"); + goto cannot_emulate; + } + src.val = (unsigned long) _eip; + JMP_REL(rel); + op_bytes = ad_bytes; + goto push; + } + case 0xe9: /* jmp rel */ + case 0xeb: /* jmp rel short */ + JMP_REL(src.val); + no_wb = 1; /* Disable writeback. */ + break; + + + } + goto writeback; + +twobyte_insn: + switch (b) { + case 0x01: /* lgdt, lidt, lmsw */ + /* Disable writeback. 
*/ + no_wb = 1; + switch (modrm_reg) { + u16 size; + unsigned long address; + + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + break; + case 3: /* lidt */ + rc = read_descriptor(ctxt, ops, src.ptr, + &size, &address, op_bytes); + if (rc) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + break; + case 4: /* smsw */ + if (modrm_mod != 3) + goto cannot_emulate; + *(u16 *)&_regs[modrm_rm] + = realmode_get_cr(ctxt->vcpu, 0); + break; + case 6: /* lmsw */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); + break; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, cr2); + break; + default: + goto cannot_emulate; + } + break; + case 0x21: /* mov from dr to reg */ + no_wb = 1; + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); + break; + case 0x23: /* mov from reg to dr */ + no_wb = 1; + if (modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); + break; + case 0x40 ... 0x4f: /* cmov */ + dst.val = dst.orig_val = src.val; + no_wb = 1; + /* + * First, assume we're decoding an even cmov opcode + * (lsb == 0). + */ + switch ((b & 15) >> 1) { + case 0: /* cmovo */ + no_wb = (_eflags & EFLG_OF) ? 0 : 1; + break; + case 1: /* cmovb/cmovc/cmovnae */ + no_wb = (_eflags & EFLG_CF) ? 0 : 1; + break; + case 2: /* cmovz/cmove */ + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; + break; + case 3: /* cmovbe/cmovna */ + no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; + break; + case 4: /* cmovs */ + no_wb = (_eflags & EFLG_SF) ? 0 : 1; + break; + case 5: /* cmovp/cmovpe */ + no_wb = (_eflags & EFLG_PF) ? 0 : 1; + break; + case 7: /* cmovle/cmovng */ + no_wb = (_eflags & EFLG_ZF) ? 0 : 1; + /* fall through */ + case 6: /* cmovl/cmovnge */ + no_wb &= (!(_eflags & EFLG_SF) != + !(_eflags & EFLG_OF)) ? 0 : 1; + break; + } + /* Odd cmov opcodes (lsb == 1) have inverted sense. */ + no_wb ^= b & 1; + break; + case 0xa3: + bt: /* bt */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); + break; + case 0xab: + bts: /* bts */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + src.orig_val = src.val; + src.val = _regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", src, dst, _eflags); + if (_eflags & EFLG_ZF) { + /* Success: write back to memory. */ + dst.val = src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + dst.type = OP_REG; + dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; + } + break; + case 0xb3: + btr: /* btr */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; + break; + case 0xba: /* Grp8 */ + switch (modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbb: + btc: /* btc */ + src.val &= (dst.bytes << 3) - 1; /* only subword offset */ + emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); + break; + case 0xbe ... 0xbf: /* movsx */ + dst.bytes = op_bytes; + dst.val = (d & ByteOp) ? 
(s8) src.val : (s16) src.val; + break; + case 0xc3: /* movnti */ + dst.bytes = op_bytes; + dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val; + break; + } + goto writeback; + +twobyte_special_insn: + /* Disable writeback. */ + no_wb = 1; + switch (b) { + case 0x06: + emulate_clts(ctxt->vcpu); + break; + case 0x08: /* invd */ + break; + case 0x09: /* wbinvd */ + break; + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + break; + case 0x20: /* mov cr, reg */ + if (modrm_mod != 3) + goto cannot_emulate; + _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); + break; + case 0x22: /* mov reg, cr */ + if (modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); + break; + case 0x30: + /* wrmsr */ + msr_data = (u32)_regs[VCPU_REGS_RAX] + | ((u64)_regs[VCPU_REGS_RDX] << 32); + rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); + if (rc) { + kvm_x86_ops->inject_gp(ctxt->vcpu, 0); + _eip = ctxt->vcpu->rip; + } + rc = X86EMUL_CONTINUE; + break; + case 0x32: + /* rdmsr */ + rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); + if (rc) { + kvm_x86_ops->inject_gp(ctxt->vcpu, 0); + _eip = ctxt->vcpu->rip; + } else { + _regs[VCPU_REGS_RAX] = (u32)msr_data; + _regs[VCPU_REGS_RDX] = msr_data >> 32; + } + rc = X86EMUL_CONTINUE; + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ { + long int rel; + + switch (op_bytes) { + case 2: + rel = insn_fetch(s16, 2, _eip); + break; + case 4: + rel = insn_fetch(s32, 4, _eip); + break; + case 8: + rel = insn_fetch(s64, 8, _eip); + break; + default: + DPRINTF("jnz: Invalid op_bytes\n"); + goto cannot_emulate; + } + if (test_cc(b, _eflags)) + JMP_REL(rel); + break; + } + case 0xc7: /* Grp9 (cmpxchg8b) */ + { + u64 old, new; + if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) + != 0) + goto done; + if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { + _regs[VCPU_REGS_RAX] = (u32) (old >> 0); + _regs[VCPU_REGS_RDX] = (u32) (old >> 32); + _eflags &= ~EFLG_ZF; + } else { + new = ((u64)_regs[VCPU_REGS_RCX] << 32) + | (u32) _regs[VCPU_REGS_RBX]; + if ((rc = ops->cmpxchg_emulated(cr2, &old, + &new, 8, ctxt->vcpu)) != 0) + goto done; + _eflags |= EFLG_ZF; + } + break; + } + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", b); + return -1; +} + +#ifdef __XEN__ + +#include +#include + +int +x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + *val = 0; + + if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, 0); /* read fault */ + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +int +x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, struct x86_emulate_ctxt *ctxt) +{ + unsigned int rc; + + if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { + propagate_page_fault(addr + bytes - rc, PGERR_write_access); + return X86EMUL_PROPAGATE_FAULT; + } + + return X86EMUL_CONTINUE; +} + +#endif diff --git a/trunk/include/asm-x86/kvm_x86_emulate.h b/trunk/drivers/kvm/x86_emulate.h similarity index 83% rename from trunk/include/asm-x86/kvm_x86_emulate.h rename to trunk/drivers/kvm/x86_emulate.h index 7db91b9bdcd4..92c73aa7f9ac 100644 --- a/trunk/include/asm-x86/kvm_x86_emulate.h +++ b/trunk/drivers/kvm/x86_emulate.h @@ -62,6 +62,17 @@ struct x86_emulate_ops { int (*read_std)(unsigned long addr, 
void *val, unsigned int bytes, struct kvm_vcpu *vcpu); + /* + * write_std: Write bytes of standard (non-emulated/special) memory. + * Used for stack operations, and others. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_std)(unsigned long addr, const void *val, + unsigned int bytes, struct kvm_vcpu *vcpu); + /* * read_emulated: Read bytes from emulated/special memory area. * @addr: [IN ] Linear address from which to read. @@ -101,50 +112,13 @@ struct x86_emulate_ops { }; -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; -}; - -struct fetch_cache { - u8 data[15]; - unsigned long start; - unsigned long end; -}; - -struct decode_cache { - u8 twobyte; - u8 b; - u8 lock_prefix; - u8 rep_prefix; - u8 op_bytes; - u8 ad_bytes; - u8 rex_prefix; - struct operand src; - struct operand dst; - unsigned long *override_base; - unsigned int d; - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; - /* modrm */ - u8 modrm; - u8 modrm_mod; - u8 modrm_reg; - u8 modrm_rm; - u8 use_modrm_ea; - unsigned long modrm_ea; - unsigned long modrm_val; - struct fetch_cache fetch; -}; - struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; /* Linear faulting address (if emulating a page-faulting instruction). */ unsigned long eflags; + unsigned long cr2; /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; @@ -155,16 +129,8 @@ struct x86_emulate_ctxt { unsigned long ss_base; unsigned long gs_base; unsigned long fs_base; - - /* decode cache */ - - struct decode_cache decode; }; -/* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 - /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ @@ -178,9 +144,12 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); +/* + * x86_emulate_memop: Emulate an instruction that faulted attempting to + * read/write a 'special' memory area. + * Returns -1 on failure, 0 on success. + */ +int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); #endif /* __X86_EMULATE_H__ */ diff --git a/trunk/drivers/lguest/core.c b/trunk/drivers/lguest/core.c index 7743d73768df..cb4c67025d52 100644 --- a/trunk/drivers/lguest/core.c +++ b/trunk/drivers/lguest/core.c @@ -151,43 +151,43 @@ int lguest_address_ok(const struct lguest *lg, /* This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random * value (all zeroes) instead of needing to return an error. */ -void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) +void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) { - if (!lguest_address_ok(cpu->lg, addr, bytes) - || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) { + if (!lguest_address_ok(lg, addr, bytes) + || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { /* copy_from_user should do this, but as we rely on it... 
*/ memset(b, 0, bytes); - kill_guest(cpu, "bad read address %#lx len %u", addr, bytes); + kill_guest(lg, "bad read address %#lx len %u", addr, bytes); } } /* This is the write (copy into guest) version. */ -void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, +void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, unsigned bytes) { - if (!lguest_address_ok(cpu->lg, addr, bytes) - || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0) - kill_guest(cpu, "bad write address %#lx len %u", addr, bytes); + if (!lguest_address_ok(lg, addr, bytes) + || copy_to_user(lg->mem_base + addr, b, bytes) != 0) + kill_guest(lg, "bad write address %#lx len %u", addr, bytes); } /*:*/ /*H:030 Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep * going around and around until something interesting happens. */ -int run_guest(struct lg_cpu *cpu, unsigned long __user *user) +int run_guest(struct lguest *lg, unsigned long __user *user) { /* We stop running once the Guest is dead. */ - while (!cpu->lg->dead) { + while (!lg->dead) { /* First we run any hypercalls the Guest wants done. */ - if (cpu->hcall) - do_hypercalls(cpu); + if (lg->hcall) + do_hypercalls(lg); /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ - if (cpu->pending_notify) { - if (put_user(cpu->pending_notify, user)) + if (lg->pending_notify) { + if (put_user(lg->pending_notify, user)) return -EFAULT; - return sizeof(cpu->pending_notify); + return sizeof(lg->pending_notify); } /* Check for signals */ @@ -195,13 +195,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) return -ERESTARTSYS; /* If Waker set break_out, return to Launcher. */ - if (cpu->break_out) + if (lg->break_out) return -EAGAIN; /* Check if there are any interrupts which can be delivered * now: if so, this sets up the hander to be executed when we * next run the Guest. */ - maybe_do_interrupt(cpu); + maybe_do_interrupt(lg); /* All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, @@ -210,12 +210,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* Just make absolutely sure the Guest is still alive. One of * those hypercalls could have been fatal, for example. */ - if (cpu->lg->dead) + if (lg->dead) break; /* If the Guest asked to be stopped, we sleep. The Guest's * clock timer or LHCALL_BREAK from the Waker will wake us. */ - if (cpu->halted) { + if (lg->halted) { set_current_state(TASK_INTERRUPTIBLE); schedule(); continue; @@ -226,17 +226,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) local_irq_disable(); /* Actually run the Guest until something happens. */ - lguest_arch_run_guest(cpu); + lguest_arch_run_guest(lg); /* Now we're ready to be interrupted or moved to other CPUs */ local_irq_enable(); /* Now we deal with whatever happened to the Guest. */ - lguest_arch_handle_trap(cpu); + lguest_arch_handle_trap(lg); } - if (cpu->lg->dead == ERR_PTR(-ERESTART)) - return -ERESTART; /* The Guest is dead => "No such file or directory" */ return -ENOENT; } @@ -255,7 +253,7 @@ static int __init init(void) /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. 
*/ if (paravirt_enabled()) { - printk("lguest is afraid of being a guest\n"); + printk("lguest is afraid of %s\n", pv_info.name); return -EPERM; } diff --git a/trunk/drivers/lguest/hypercalls.c b/trunk/drivers/lguest/hypercalls.c index 0f2cb4fd7c69..b478affe8f91 100644 --- a/trunk/drivers/lguest/hypercalls.c +++ b/trunk/drivers/lguest/hypercalls.c @@ -23,14 +23,13 @@ #include #include #include -#include #include #include #include "lg.h" /*H:120 This is the core hypercall routine: where the Guest gets what it wants. * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ -static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) +static void do_hcall(struct lguest *lg, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: @@ -40,62 +39,60 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ - kill_guest(cpu, "already have lguest_data"); + kill_guest(lg, "already have lguest_data"); break; - case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four + case LHCALL_CRASH: { + /* Crash is such a trivial hypercall that we do it in four * lines right here. */ char msg[128]; /* If the lgread fails, it will call kill_guest() itself; the * kill_guest() with the message will be ignored. */ - __lgread(cpu, msg, args->arg1, sizeof(msg)); + __lgread(lg, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; - kill_guest(cpu, "CRASH: %s", msg); - if (args->arg2 == LGUEST_SHUTDOWN_RESTART) - cpu->lg->dead = ERR_PTR(-ERESTART); + kill_guest(lg, "CRASH: %s", msg); break; } case LHCALL_FLUSH_TLB: /* FLUSH_TLB comes in two flavors, depending on the * argument: */ if (args->arg1) - guest_pagetable_clear_all(cpu); + guest_pagetable_clear_all(lg); else - guest_pagetable_flush_user(cpu); + guest_pagetable_flush_user(lg); break; /* All these calls simply pass the arguments through to the right * routines. */ case LHCALL_NEW_PGTABLE: - guest_new_pagetable(cpu, args->arg1); + guest_new_pagetable(lg, args->arg1); break; case LHCALL_SET_STACK: - guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); + guest_set_stack(lg, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: - guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); + guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); break; case LHCALL_SET_PMD: - guest_set_pmd(cpu->lg, args->arg1, args->arg2); + guest_set_pmd(lg, args->arg1, args->arg2); break; case LHCALL_SET_CLOCKEVENT: - guest_set_clockevent(cpu, args->arg1); + guest_set_clockevent(lg, args->arg1); break; case LHCALL_TS: /* This sets the TS flag, as we saw used in run_guest(). */ - cpu->ts = args->arg1; + lg->ts = args->arg1; break; case LHCALL_HALT: /* Similarly, this sets the halted flag for run_guest(). */ - cpu->halted = 1; + lg->halted = 1; break; case LHCALL_NOTIFY: - cpu->pending_notify = args->arg1; + lg->pending_notify = args->arg1; break; default: /* It should be an architecture-specific hypercall. */ - if (lguest_arch_do_hcall(cpu, args)) - kill_guest(cpu, "Bad hypercall %li\n", args->arg0); + if (lguest_arch_do_hcall(lg, args)) + kill_guest(lg, "Bad hypercall %li\n", args->arg0); } } /*:*/ @@ -107,13 +104,13 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before * checking for a normal hcall). 
*/ -static void do_async_hcalls(struct lg_cpu *cpu) +static void do_async_hcalls(struct lguest *lg) { unsigned int i; u8 st[LHCALL_RING_SIZE]; /* For simplicity, we copy the entire call status array in at once. */ - if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) + if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) return; /* We process "struct lguest_data"s hcalls[] ring once. */ @@ -122,7 +119,7 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest * places them in the ring. */ - unsigned int n = cpu->next_hcall; + unsigned int n = lg->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) @@ -130,65 +127,65 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* OK, we have hypercall. Increment the "next_hcall" cursor, * and wrap back to 0 if we reach the end. */ - if (++cpu->next_hcall == LHCALL_RING_SIZE) - cpu->next_hcall = 0; + if (++lg->next_hcall == LHCALL_RING_SIZE) + lg->next_hcall = 0; /* Copy the hypercall arguments into a local copy of * the hcall_args struct. */ - if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], + if (copy_from_user(&args, &lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { - kill_guest(cpu, "Fetching async hypercalls"); + kill_guest(lg, "Fetching async hypercalls"); break; } /* Do the hypercall, same as a normal one. */ - do_hcall(cpu, &args); + do_hcall(lg, &args); /* Mark the hypercall done. */ - if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { - kill_guest(cpu, "Writing result for async hypercall"); + if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { + kill_guest(lg, "Writing result for async hypercall"); break; } /* Stop doing hypercalls if they want to notify the Launcher: * it needs to service this first. */ - if (cpu->pending_notify) + if (lg->pending_notify) break; } } /* Last of all, we look at what happens first of all. The very first time the * Guest makes a hypercall, we end up here to set things up: */ -static void initialize(struct lg_cpu *cpu) +static void initialize(struct lguest *lg) { /* You can't do anything until you're initialized. The Guest knows the * rules, so we're unforgiving here. */ - if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { - kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); + if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { + kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); return; } - if (lguest_arch_init_hypercalls(cpu)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + if (lguest_arch_init_hypercalls(lg)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); /* The Guest tells us where we're not to deliver interrupts by putting * the range of addresses into "struct lguest_data". */ - if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) - || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) + || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); /* We write the current time into the Guest's data page once so it can * set its clock. */ - write_timestamp(cpu); + write_timestamp(lg); /* page_tables.c will also do some setup. 
*/ - page_table_guest_data_init(cpu); + page_table_guest_data_init(lg); /* This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest * pagetable. */ - guest_pagetable_clear_all(cpu); + guest_pagetable_clear_all(lg); } /*H:100 @@ -197,27 +194,27 @@ static void initialize(struct lg_cpu *cpu) * Remember from the Guest, hypercalls come in two flavors: normal and * asynchronous. This file handles both of types. */ -void do_hypercalls(struct lg_cpu *cpu) +void do_hypercalls(struct lguest *lg) { /* Not initialized yet? This hypercall must do it. */ - if (unlikely(!cpu->lg->lguest_data)) { + if (unlikely(!lg->lguest_data)) { /* Set up the "struct lguest_data" */ - initialize(cpu); + initialize(lg); /* Hcall is done. */ - cpu->hcall = NULL; + lg->hcall = NULL; return; } /* The Guest has initialized. * * Look in the hypercall ring for the async hypercalls: */ - do_async_hcalls(cpu); + do_async_hcalls(lg); /* If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do * the hypercall. */ - if (!cpu->pending_notify) { - do_hcall(cpu, cpu->hcall); + if (!lg->pending_notify) { + do_hcall(lg, lg->hcall); /* Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. @@ -228,17 +225,16 @@ void do_hypercalls(struct lg_cpu *cpu) * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the * hypercall. */ - cpu->hcall = NULL; + lg->hcall = NULL; } } /* This routine supplies the Guest with time: it's used for wallclock time at * initial boot and as a rough time source if the TSC isn't available. */ -void write_timestamp(struct lg_cpu *cpu) +void write_timestamp(struct lguest *lg) { struct timespec now; ktime_get_real_ts(&now); - if (copy_to_user(&cpu->lg->lguest_data->time, - &now, sizeof(struct timespec))) - kill_guest(cpu, "Writing timestamp"); + if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) + kill_guest(lg, "Writing timestamp"); } diff --git a/trunk/drivers/lguest/interrupts_and_traps.c b/trunk/drivers/lguest/interrupts_and_traps.c index 32e97c1858e5..2b66f79c208b 100644 --- a/trunk/drivers/lguest/interrupts_and_traps.c +++ b/trunk/drivers/lguest/interrupts_and_traps.c @@ -41,11 +41,11 @@ static int idt_present(u32 lo, u32 hi) /* We need a helper to "push" a value onto the Guest's stack, since that's a * big part of what delivering an interrupt does. */ -static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) +static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ *gstack -= 4; - lgwrite(cpu, *gstack, u32, val); + lgwrite(lg, *gstack, u32, val); } /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or @@ -60,7 +60,7 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo * it). 
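 *
 * (As a worked picture -- a sketch, not text from the patch -- assuming the
 * Guest was interrupted while in userspace, the frame built below on its
 * kernel stack is laid out just like a hardware interrupt frame:
 *
 *	[ old ss     ]   only pushed when privilege levels change
 *	[ old esp    ]
 *	[ eflags     ]   IF copied from lguest_data.irq_enabled
 *	[ old cs     ]
 *	[ old eip    ]
 *	[ error code ]   only for the traps which supply one
 *
 * so the handler, and the "iret" that ends it, see exactly what they would
 * have seen on bare metal.)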
*/ -static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) +static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) { unsigned long gstack, origstack; u32 eflags, ss, irq_enable; @@ -69,59 +69,59 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) /* There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in * userspace. We check the privilege level to find out. */ - if ((cpu->regs->ss&0x3) != GUEST_PL) { + if ((lg->regs->ss&0x3) != GUEST_PL) { /* The Guest told us their kernel stack with the SET_STACK * hypercall: both the virtual address and the segment */ - virtstack = cpu->esp1; - ss = cpu->ss1; + virtstack = lg->esp1; + ss = lg->ss1; - origstack = gstack = guest_pa(cpu, virtstack); + origstack = gstack = guest_pa(lg, virtstack); /* We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege * levels and expect these here. */ - push_guest_stack(cpu, &gstack, cpu->regs->ss); - push_guest_stack(cpu, &gstack, cpu->regs->esp); + push_guest_stack(lg, &gstack, lg->regs->ss); + push_guest_stack(lg, &gstack, lg->regs->esp); } else { /* We're staying on the same Guest (kernel) stack. */ - virtstack = cpu->regs->esp; - ss = cpu->regs->ss; + virtstack = lg->regs->esp; + ss = lg->regs->ss; - origstack = gstack = guest_pa(cpu, virtstack); + origstack = gstack = guest_pa(lg, virtstack); } /* Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest * copy it back in "lguest_iret". */ - eflags = cpu->regs->eflags; - if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 + eflags = lg->regs->eflags; + if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; /* An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction * pointer. */ - push_guest_stack(cpu, &gstack, eflags); - push_guest_stack(cpu, &gstack, cpu->regs->cs); - push_guest_stack(cpu, &gstack, cpu->regs->eip); + push_guest_stack(lg, &gstack, eflags); + push_guest_stack(lg, &gstack, lg->regs->cs); + push_guest_stack(lg, &gstack, lg->regs->eip); /* For the six traps which supply an error code, we push that, too. */ if (has_err) - push_guest_stack(cpu, &gstack, cpu->regs->errcode); + push_guest_stack(lg, &gstack, lg->regs->errcode); /* Now we've pushed all the old state, we change the stack, the code * segment and the address to execute. */ - cpu->regs->ss = ss; - cpu->regs->esp = virtstack + (gstack - origstack); - cpu->regs->cs = (__KERNEL_CS|GUEST_PL); - cpu->regs->eip = idt_address(lo, hi); + lg->regs->ss = ss; + lg->regs->esp = virtstack + (gstack - origstack); + lg->regs->cs = (__KERNEL_CS|GUEST_PL); + lg->regs->eip = idt_address(lo, hi); /* There are two kinds of interrupt handlers: 0xE is an "interrupt * gate" which expects interrupts to be disabled on entry. 
*/ if (idt_type(lo, hi) == 0xE) - if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Disabling interrupts"); + if (put_user(0, &lg->lguest_data->irq_enabled)) + kill_guest(lg, "Disabling interrupts"); } /*H:205 @@ -129,23 +129,23 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, int has_err) * * maybe_do_interrupt() gets called before every entry to the Guest, to see if * we should divert the Guest to running an interrupt handler. */ -void maybe_do_interrupt(struct lg_cpu *cpu) +void maybe_do_interrupt(struct lguest *lg) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); struct desc_struct *idt; /* If the Guest hasn't even initialized yet, we can do nothing. */ - if (!cpu->lg->lguest_data) + if (!lg->lguest_data) return; /* Take our "irqs_pending" array and remove any interrupts the Guest * wants blocked: the result ends up in "blk". */ - if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, + if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts, sizeof(blk))) return; - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); + bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); @@ -155,20 +155,19 @@ void maybe_do_interrupt(struct lg_cpu *cpu) /* They may be in the middle of an iret, where they asked us never to * deliver interrupts. */ - if (cpu->regs->eip >= cpu->lg->noirq_start && - (cpu->regs->eip < cpu->lg->noirq_end)) + if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end) return; /* If they're halted, interrupts restart them. */ - if (cpu->halted) { + if (lg->halted) { /* Re-enable interrupts. */ - if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Re-enabling interrupts"); - cpu->halted = 0; + if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled)) + kill_guest(lg, "Re-enabling interrupts"); + lg->halted = 0; } else { /* Otherwise we check if they have interrupts disabled. */ u32 irq_enabled; - if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) + if (get_user(irq_enabled, &lg->lguest_data->irq_enabled)) irq_enabled = 0; if (!irq_enabled) return; @@ -177,15 +176,15 @@ void maybe_do_interrupt(struct lg_cpu *cpu) /* Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip * over them. */ - idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; + idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ - clear_bit(irq, cpu->irqs_pending); + clear_bit(irq, lg->irqs_pending); /* set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto * the stack as well: virtual interrupts never do. */ - set_guest_interrupt(cpu, idt->a, idt->b, 0); + set_guest_interrupt(lg, idt->a, idt->b, 0); } /* Every time we deliver an interrupt, we update the timestamp in the @@ -193,7 +192,7 @@ void maybe_do_interrupt(struct lg_cpu *cpu) * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every * timer interrupt. */ - write_timestamp(cpu); + write_timestamp(lg); } /*:*/ @@ -246,19 +245,19 @@ static int has_err(unsigned int trap) } /* deliver_trap() returns true if it could deliver the trap. 
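 *
 * (For orientation -- a sketch, not code from this patch: "lo" and "hi" are
 * the two 32-bit halves of an x86 IDT gate descriptor, so the idt_*() helpers
 * used here presumably boil down to something like
 *
 *	address = (lo & 0x0000FFFF) | (hi & 0xFFFF0000);   idt_address()
 *	type    = (hi >> 8) & 0xF;                         idt_type(): 0xE or 0xF
 *	present = hi & 0x8000;                             idt_present()
 *
 * which is all deliver_trap() and set_guest_interrupt() need from the Guest's
 * table.)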
*/ -int deliver_trap(struct lg_cpu *cpu, unsigned int num) +int deliver_trap(struct lguest *lg, unsigned int num) { /* Trap numbers are always 8 bit, but we set an impossible trap number * for traps inside the Switcher, so check that here. */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) + if (num >= ARRAY_SIZE(lg->arch.idt)) return 0; /* Early on the Guest hasn't set the IDT entries (or maybe it put a * bogus one in): if we fail here, the Guest will be killed. */ - if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) + if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) return 0; - set_guest_interrupt(cpu, cpu->arch.idt[num].a, - cpu->arch.idt[num].b, has_err(num)); + set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, + has_err(num)); return 1; } @@ -310,18 +309,18 @@ static int direct_trap(unsigned int num) * the Guest. * * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ -void pin_stack_pages(struct lg_cpu *cpu) +void pin_stack_pages(struct lguest *lg) { unsigned int i; /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or * two pages of stack space. */ - for (i = 0; i < cpu->lg->stack_pages; i++) + for (i = 0; i < lg->stack_pages; i++) /* The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to * get to the rest of the stack pages. */ - pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); + pin_page(lg, lg->esp1 - 1 - i * PAGE_SIZE); } /* Direct traps also mean that we need to know whenever the Guest wants to use @@ -332,21 +331,21 @@ void pin_stack_pages(struct lg_cpu *cpu) * * In Linux each process has its own kernel stack, so this happens a lot: we * change stacks on each context switch. */ -void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) { /* You are not allowed have a stack segment with privilege level 0: bad * Guest! */ if ((seg & 0x3) != GUEST_PL) - kill_guest(cpu, "bad stack segment %i", seg); + kill_guest(lg, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ if (pages > 2) - kill_guest(cpu, "bad stack pages %u", pages); + kill_guest(lg, "bad stack pages %u", pages); /* Save where the stack is, and how many pages */ - cpu->ss1 = seg; - cpu->esp1 = esp; - cpu->lg->stack_pages = pages; + lg->ss1 = seg; + lg->esp1 = esp; + lg->stack_pages = pages; /* Make sure the new stack pages are mapped */ - pin_stack_pages(cpu); + pin_stack_pages(lg); } /* All this reference to mapping stacks leads us neatly into the other complex @@ -354,7 +353,7 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) /*H:235 This is the routine which actually checks the Guest's IDT entry and * transfers it into the entry in "struct lguest": */ -static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, +static void set_trap(struct lguest *lg, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { u8 type = idt_type(lo, hi); @@ -367,7 +366,7 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, /* We only support interrupt and trap gates. */ if (type != 0xE && type != 0xF) - kill_guest(cpu, "bad IDT type %i", type); + kill_guest(lg, "bad IDT type %i", type); /* We only copy the handler address, present bit, privilege level and * type. 
The privilege level controls where the trap can be triggered @@ -384,7 +383,7 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) +void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) { /* Guest never handles: NMI, doublefault, spurious interrupt or * hypercall. We ignore when it tries to set them. */ @@ -393,13 +392,13 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) /* Mark the IDT as changed: next time the Guest runs we'll know we have * to copy this again. */ - cpu->changed |= CHANGED_IDT; + lg->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) - kill_guest(cpu, "Setting idt entry %u", num); + if (num >= ARRAY_SIZE(lg->arch.idt)) + kill_guest(lg, "Setting idt entry %u", num); else - set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); + set_trap(lg, &lg->arch.idt[num], num, lo, hi); } /* The default entry for each interrupt points into the Switcher routines which @@ -435,14 +434,14 @@ void setup_default_idt_entries(struct lguest_ro_state *state, /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just * before we run the Guest. This routine does that copy. */ -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, +void copy_traps(const struct lguest *lg, struct desc_struct *idt, const unsigned long *def) { unsigned int i; /* We can simply copy the direct traps, otherwise we use the default * ones in the Switcher: they will return to the Host. */ - for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { + for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { /* If no Guest can ever override this trap, leave it alone. */ if (!direct_trap(i)) continue; @@ -451,8 +450,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. */ - if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF) - idt[i] = cpu->arch.idt[i]; + if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) + idt[i] = lg->arch.idt[i]; else /* Reset it to the default. */ default_idt_entry(&idt[i], i, def[i]); @@ -471,13 +470,13 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * infrastructure to set a callback at that time. * * 0 means "turn off the clock". */ -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) +void guest_set_clockevent(struct lguest *lg, unsigned long delta) { ktime_t expires; if (unlikely(delta == 0)) { /* Clock event device is shutting down. */ - hrtimer_cancel(&cpu->hrt); + hrtimer_cancel(&lg->hrt); return; } @@ -485,25 +484,25 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) * all the time between now and the timer interrupt it asked for. This * is almost always the right thing to do. */ expires = ktime_add_ns(ktime_get_real(), delta); - hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); + hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); } /* This is the function called when the Guest's timer expires. 
*/ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) { - struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); + struct lguest *lg = container_of(timer, struct lguest, hrt); /* Remember the first interrupt is the timer interrupt. */ - set_bit(0, cpu->irqs_pending); + set_bit(0, lg->irqs_pending); /* If the Guest is actually stopped, we need to wake it up. */ - if (cpu->halted) - wake_up_process(cpu->tsk); + if (lg->halted) + wake_up_process(lg->tsk); return HRTIMER_NORESTART; } /* This sets up the timer for this Guest. */ -void init_clockdev(struct lg_cpu *cpu) +void init_clockdev(struct lguest *lg) { - hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); - cpu->hrt.function = clockdev_fn; + hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); + lg->hrt.function = clockdev_fn; } diff --git a/trunk/drivers/lguest/lg.h b/trunk/drivers/lguest/lg.h index 2337e1a06f02..86924891b5eb 100644 --- a/trunk/drivers/lguest/lg.h +++ b/trunk/drivers/lguest/lg.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -39,72 +38,58 @@ struct lguest_pages #define CHANGED_GDT_TLS 4 /* Actually a subset of CHANGED_GDT */ #define CHANGED_ALL 3 -struct lguest; - -struct lg_cpu { - unsigned int id; - struct lguest *lg; +/* The private info the thread maintains about the guest. */ +struct lguest +{ + /* At end of a page shared mapped over lguest_pages in guest. */ + unsigned long regs_page; + struct lguest_regs *regs; + struct lguest_data __user *lguest_data; struct task_struct *tsk; struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ - + u32 pfn_limit; + /* This provides the offset to the base of guest-physical + * memory in the Launcher. */ + void __user *mem_base; + unsigned long kernel_address; u32 cr2; + int halted; int ts; + u32 next_hcall; u32 esp1; u8 ss1; - /* Bitmap of what has changed: see CHANGED_* above. */ - int changed; - - unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - - /* At end of a page shared mapped over lguest_pages in guest. */ - unsigned long regs_page; - struct lguest_regs *regs; - - struct lguest_pages *last_pages; - - int cpu_pgd; /* which pgd this cpu is currently using */ - /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; - u32 next_hcall; - - /* Virtual clock device */ - struct hrtimer hrt; /* Do we need to stop what we're doing and return to userspace? */ int break_out; wait_queue_head_t break_wq; - int halted; - - /* Pending virtual interrupts */ - DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); - - struct lg_cpu_arch arch; -}; -/* The private info the thread maintains about the guest. */ -struct lguest -{ - struct lguest_data __user *lguest_data; - struct lg_cpu cpus[NR_CPUS]; - unsigned int nr_cpus; - - u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ - void __user *mem_base; - unsigned long kernel_address; + /* Bitmap of what has changed: see CHANGED_* above. */ + int changed; + struct lguest_pages *last_pages; + /* We keep a small number of these. */ + u32 pgdidx; struct pgdir pgdirs[4]; unsigned long noirq_start, noirq_end; + unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ unsigned int stack_pages; u32 tsc_khz; /* Dead? 
*/ const char *dead; + + struct lguest_arch arch; + + /* Virtual clock device */ + struct hrtimer hrt; + + /* Pending virtual interrupts */ + DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); }; extern struct mutex lguest_lock; @@ -112,26 +97,26 @@ extern struct mutex lguest_lock; /* core.c: */ int lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); -void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); -void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); +void __lgread(struct lguest *, void *, unsigned long, unsigned); +void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); /*H:035 Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * * This reads into a variable of the given type then returns that. */ -#define lgread(cpu, addr, type) \ - ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) +#define lgread(lg, addr, type) \ + ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) /* This checks that the variable is of the given type, then writes it out. */ -#define lgwrite(cpu, addr, type, val) \ +#define lgwrite(lg, addr, type, val) \ do { \ typecheck(type, val); \ - __lgwrite((cpu), (addr), &(val), sizeof(val)); \ + __lgwrite((lg), (addr), &(val), sizeof(val)); \ } while(0) /* (end of memory access helper routines) :*/ -int run_guest(struct lg_cpu *cpu, unsigned long __user *user); +int run_guest(struct lguest *lg, unsigned long __user *user); /* Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined @@ -141,53 +126,52 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -void maybe_do_interrupt(struct lg_cpu *cpu); -int deliver_trap(struct lg_cpu *cpu, unsigned int num); -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, - u32 low, u32 hi); -void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); -void pin_stack_pages(struct lg_cpu *cpu); +void maybe_do_interrupt(struct lguest *lg); +int deliver_trap(struct lguest *lg, unsigned int num); +void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi); +void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages); +void pin_stack_pages(struct lguest *lg); void setup_default_idt_entries(struct lguest_ro_state *state, const unsigned long *def); -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, +void copy_traps(const struct lguest *lg, struct desc_struct *idt, const unsigned long *def); -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); -void init_clockdev(struct lg_cpu *cpu); +void guest_set_clockevent(struct lguest *lg, unsigned long delta); +void init_clockdev(struct lguest *lg); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); void free_interrupts(void); /* segments.c: */ void setup_default_gdt_entries(struct lguest_ro_state *state); -void setup_guest_gdt(struct lg_cpu *cpu); -void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num); -void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); -void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); -void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); +void setup_guest_gdt(struct lguest *lg); +void load_guest_gdt(struct lguest *lg, 
unsigned long table, u32 num); +void guest_load_tls(struct lguest *lg, unsigned long tls_array); +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt); +void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); /* page_tables.c: */ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); void free_guest_pagetable(struct lguest *lg); -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); -void guest_pagetable_clear_all(struct lg_cpu *cpu); -void guest_pagetable_flush_user(struct lg_cpu *cpu); -void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, +void guest_pagetable_clear_all(struct lguest *lg); +void guest_pagetable_flush_user(struct lguest *lg); +void guest_set_pte(struct lguest *lg, unsigned long gpgdir, unsigned long vaddr, pte_t val); -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); -int demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); -void pin_page(struct lg_cpu *cpu, unsigned long vaddr); -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); -void page_table_guest_data_init(struct lg_cpu *cpu); +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); +int demand_page(struct lguest *info, unsigned long cr2, int errcode); +void pin_page(struct lguest *lg, unsigned long vaddr); +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); +void page_table_guest_data_init(struct lguest *lg); /* /core.c: */ void lguest_arch_host_init(void); void lguest_arch_host_fini(void); -void lguest_arch_run_guest(struct lg_cpu *cpu); -void lguest_arch_handle_trap(struct lg_cpu *cpu); -int lguest_arch_init_hypercalls(struct lg_cpu *cpu); -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); +void lguest_arch_run_guest(struct lguest *lg); +void lguest_arch_handle_trap(struct lguest *lg); +int lguest_arch_init_hypercalls(struct lguest *lg); +int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); +void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); /* /switcher.S: */ extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; @@ -197,8 +181,8 @@ int lguest_device_init(void); void lguest_device_remove(void); /* hypercalls.c: */ -void do_hypercalls(struct lg_cpu *cpu); -void write_timestamp(struct lg_cpu *cpu); +void do_hypercalls(struct lguest *lg); +void write_timestamp(struct lguest *lg); /*L:035 * Let's step aside for the moment, to study one important routine that's used @@ -224,12 +208,12 @@ void write_timestamp(struct lg_cpu *cpu); * Like any macro which uses an "if", it is safely wrapped in a run-once "do { * } while(0)". */ -#define kill_guest(cpu, fmt...) \ +#define kill_guest(lg, fmt...) 
\ do { \ - if (!(cpu)->lg->dead) { \ - (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ - if (!(cpu)->lg->dead) \ - (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ + if (!(lg)->dead) { \ + (lg)->dead = kasprintf(GFP_ATOMIC, fmt); \ + if (!(lg)->dead) \ + (lg)->dead = ERR_PTR(-ENOMEM); \ } \ } while(0) /* (End of aside) :*/ diff --git a/trunk/drivers/lguest/lguest_user.c b/trunk/drivers/lguest/lguest_user.c index 85d42d3d01a9..3b92a61ba8d2 100644 --- a/trunk/drivers/lguest/lguest_user.c +++ b/trunk/drivers/lguest/lguest_user.c @@ -6,7 +6,6 @@ #include #include #include -#include #include "lg.h" /*L:055 When something happens, the Waker process needs a way to stop the @@ -14,7 +13,7 @@ * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release * the Waker. */ -static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) +static int break_guest_out(struct lguest *lg, const unsigned long __user *input) { unsigned long on; @@ -23,21 +22,21 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) return -EFAULT; if (on) { - cpu->break_out = 1; + lg->break_out = 1; /* Pop it out of the Guest (may be running on different CPU) */ - wake_up_process(cpu->tsk); + wake_up_process(lg->tsk); /* Wait for them to reset it */ - return wait_event_interruptible(cpu->break_wq, !cpu->break_out); + return wait_event_interruptible(lg->break_wq, !lg->break_out); } else { - cpu->break_out = 0; - wake_up(&cpu->break_wq); + lg->break_out = 0; + wake_up(&lg->break_wq); return 0; } } /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. */ -static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) +static int user_send_irq(struct lguest *lg, const unsigned long __user *input) { unsigned long irq; @@ -47,7 +46,7 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return -EINVAL; /* Next time the Guest runs, the core code will see if it can deliver * this interrupt. */ - set_bit(irq, cpu->irqs_pending); + set_bit(irq, lg->irqs_pending); return 0; } @@ -56,21 +55,13 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; - struct lg_cpu *cpu; - unsigned int cpu_id = *o; /* You must write LHREQ_INITIALIZE first! */ if (!lg) return -EINVAL; - /* Watch out for arbitrary vcpu indexes! */ - if (cpu_id >= lg->nr_cpus) - return -EINVAL; - - cpu = &lg->cpus[cpu_id]; - /* If you're not the task which owns the Guest, go away. */ - if (current != cpu->tsk) + if (current != lg->tsk) return -EPERM; /* If the guest is already dead, we indicate why */ @@ -90,53 +81,11 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) /* If we returned from read() last time because the Guest notified, * clear the flag. */ - if (cpu->pending_notify) - cpu->pending_notify = 0; + if (lg->pending_notify) + lg->pending_notify = 0; /* Run the Guest until something interesting happens. 
*/ - return run_guest(cpu, (unsigned long __user *)user); -} - -static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) -{ - if (id >= NR_CPUS) - return -EINVAL; - - cpu->id = id; - cpu->lg = container_of((cpu - id), struct lguest, cpus[0]); - cpu->lg->nr_cpus++; - init_clockdev(cpu); - - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ - cpu->regs_page = get_zeroed_page(GFP_KERNEL); - if (!cpu->regs_page) - return -ENOMEM; - - /* We actually put the registers at the bottom of the page. */ - cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - - /* Now we initialize the Guest's registers, handing it the start - * address. */ - lguest_arch_setup_regs(cpu, start_ip); - - /* Initialize the queue for the waker to wait on */ - init_waitqueue_head(&cpu->break_wq); - - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (inter-Guest I/O). */ - cpu->tsk = current; - - /* We need to keep a pointer to the Launcher's memory map, because if - * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ - cpu->mm = get_task_mm(cpu->tsk); - - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ - cpu->last_pages = NULL; - - return 0; + return run_guest(lg, (unsigned long __user *)user); } /*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) @@ -185,10 +134,15 @@ static int initialize(struct file *file, const unsigned long __user *input) lg->mem_base = (void __user *)(long)args[0]; lg->pfn_limit = args[1]; - /* This is the first cpu */ - err = lg_cpu_start(&lg->cpus[0], 0, args[3]); - if (err) + /* We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. */ + lg->regs_page = get_zeroed_page(GFP_KERNEL); + if (!lg->regs_page) { + err = -ENOMEM; goto release_guest; + } + /* We actually put the registers at the bottom of the page. */ + lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs); /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us. This allocates memory, so can @@ -197,6 +151,28 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_regs; + /* Now we initialize the Guest's registers, handing it the start + * address. */ + lguest_arch_setup_regs(lg, args[3]); + + /* The timer for lguest's clock needs initialization. */ + init_clockdev(lg); + + /* We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (inter-Guest I/O). */ + lg->tsk = current; + /* We need to keep a pointer to the Launcher's memory map, because if + * the Launcher dies we need to clean it up. If we don't keep a + * reference, it is destroyed before close() is called. */ + lg->mm = get_task_mm(lg->tsk); + + /* Initialize the queue for the waker to wait on */ + init_waitqueue_head(&lg->break_wq); + + /* We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. */ + lg->last_pages = NULL; + /* We keep our "struct lguest" in the file's private_data. 
*/ file->private_data = lg; @@ -206,8 +182,7 @@ static int initialize(struct file *file, const unsigned long __user *input) return sizeof(args); free_regs: - /* FIXME: This should be in free_vcpu */ - free_page(lg->cpus[0].regs_page); + free_page(lg->regs_page); release_guest: kfree(lg); unlock: @@ -227,37 +202,30 @@ static ssize_t write(struct file *file, const char __user *in, struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; - struct lg_cpu *uninitialized_var(cpu); - unsigned int cpu_id = *off; if (get_user(req, input) != 0) return -EFAULT; input++; /* If you haven't initialized, you must do that first. */ - if (req != LHREQ_INITIALIZE) { - if (!lg || (cpu_id >= lg->nr_cpus)) - return -EINVAL; - cpu = &lg->cpus[cpu_id]; - if (!cpu) - return -EINVAL; - } + if (req != LHREQ_INITIALIZE && !lg) + return -EINVAL; /* Once the Guest is dead, all you can do is read() why it died. */ if (lg && lg->dead) return -ENOENT; /* If you're not the task which owns the Guest, you can only break */ - if (lg && current != cpu->tsk && req != LHREQ_BREAK) + if (lg && current != lg->tsk && req != LHREQ_BREAK) return -EPERM; switch (req) { case LHREQ_INITIALIZE: return initialize(file, input); case LHREQ_IRQ: - return user_send_irq(cpu, input); + return user_send_irq(lg, input); case LHREQ_BREAK: - return break_guest_out(cpu, input); + return break_guest_out(lg, input); default: return -EINVAL; } @@ -273,7 +241,6 @@ static ssize_t write(struct file *file, const char __user *in, static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; - unsigned int i; /* If we never successfully initialized, there's nothing to clean up */ if (!lg) @@ -282,23 +249,19 @@ static int close(struct inode *inode, struct file *file) /* We need the big lock, to protect from inter-guest I/O and other * Launchers initializing guests. */ mutex_lock(&lguest_lock); - + /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ + hrtimer_cancel(&lg->hrt); /* Free up the shadow page tables for the Guest. */ free_guest_pagetable(lg); - - for (i = 0; i < lg->nr_cpus; i++) { - /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ - hrtimer_cancel(&lg->cpus[i].hrt); - /* We can free up the register page we allocated. */ - free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ - mmput(lg->cpus[i].mm); - } + /* Now all the memory cleanups are done, it's safe to release the + * Launcher's memory management structure. */ + mmput(lg->mm); /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) kfree(lg->dead); + /* We can free up the register page we allocated. */ + free_page(lg->regs_page); /* We clear the entire structure, which also marks it as free for the * next user. */ memset(lg, 0, sizeof(*lg)); diff --git a/trunk/drivers/lguest/page_tables.c b/trunk/drivers/lguest/page_tables.c index 74b4cf2a6c41..fffabb327157 100644 --- a/trunk/drivers/lguest/page_tables.c +++ b/trunk/drivers/lguest/page_tables.c @@ -68,23 +68,23 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's * usually the current one). 
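 *
 * (A concrete example, assuming the usual 32-bit two-level layout these
 * helpers rely on -- a sketch, not from the patch: for a Guest virtual
 * address such as 0xC0102ABC,
 *
 *	pgd index = vaddr >> 22           = 0x300   which pgdir[] slot
 *	pte index = (vaddr >> 12) & 0x3FF = 0x102   slot within that PTE page
 *	offset    = vaddr & 0xFFF         = 0xABC   byte within the final page
 *
 * spgd_addr()/gpgd_addr() below do the first step; spte_addr()/gpte_addr()
 * do the second.)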
*/ -static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) +static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { - kill_guest(cpu, "attempt to access switcher pages"); + kill_guest(lg, "attempt to access switcher pages"); index = 0; } /* Return a pointer index'th pgd entry for the i'th page table. */ - return &cpu->lg->pgdirs[i].pgdir[index]; + return &lg->pgdirs[i].pgdir[index]; } /* This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a * pointer to the PTE entry for the given address. */ -static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) { pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ @@ -94,13 +94,14 @@ static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) /* These two functions just like the above two, except they access the Guest * page tables. Hence they return a Guest address. */ -static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) +static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); - return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); + return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); } -static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) +static unsigned long gpte_addr(struct lguest *lg, + pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); @@ -137,7 +138,7 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page * number. */ -static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) +static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) { unsigned long pfn, base, flags; @@ -148,7 +149,7 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ - base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; + base = (unsigned long)lg->mem_base / PAGE_SIZE; /* We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't @@ -156,7 +157,7 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) * page, given the virtual number. */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { - kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); + kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); /* When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think * this one is valid! 
*/ @@ -176,18 +177,17 @@ static void release_pte(pte_t pte) } /*:*/ -static void check_gpte(struct lg_cpu *cpu, pte_t gpte) +static void check_gpte(struct lguest *lg, pte_t gpte) { if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) - || pte_pfn(gpte) >= cpu->lg->pfn_limit) - kill_guest(cpu, "bad page table entry"); + || pte_pfn(gpte) >= lg->pfn_limit) + kill_guest(lg, "bad page table entry"); } -static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) +static void check_gpgd(struct lguest *lg, pgd_t gpgd) { - if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) - kill_guest(cpu, "bad page directory entry"); + if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) + kill_guest(lg, "bad page directory entry"); } /*H:330 @@ -200,7 +200,7 @@ static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) * * If we fixed up the fault (ie. we mapped the address), this routine returns * true. Otherwise, it was a real fault and we need to tell the Guest. */ -int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) { pgd_t gpgd; pgd_t *spgd; @@ -209,24 +209,24 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t *spte; /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) return 0; /* Now look at the matching shadow entry. */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); + spgd = spgd_addr(lg, lg->pgdidx, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); /* This is not really the Guest's fault, but killing it is * simple for this corner case. */ if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); + kill_guest(lg, "out of memory allocating pte page"); return 0; } /* We check that the Guest pgd is OK. */ - check_gpgd(cpu, gpgd); + check_gpgd(lg, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); @@ -234,8 +234,8 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ - gpte_ptr = gpte_addr(gpgd, vaddr); - gpte = lgread(cpu, gpte_ptr, pte_t); + gpte_ptr = gpte_addr(lg, gpgd, vaddr); + gpte = lgread(lg, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) @@ -252,7 +252,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ - check_gpte(cpu, gpte); + check_gpte(lg, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ gpte = pte_mkyoung(gpte); @@ -260,7 +260,7 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(*spgd, vaddr); + spte = spte_addr(lg, *spgd, vaddr); /* If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. 
*/ release_pte(*spte); @@ -268,17 +268,17 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* If this is a write, we insist that the Guest page is writable (the * final arg to gpte_to_spte()). */ if (pte_dirty(gpte)) - *spte = gpte_to_spte(cpu, gpte, 1); + *spte = gpte_to_spte(lg, gpte, 1); else /* If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); + *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ - lgwrite(cpu, gpte_ptr, pte_t, gpte); + lgwrite(lg, gpte_ptr, pte_t, gpte); /* The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small @@ -297,19 +297,19 @@ int demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * * This is a quick version which answers the question: is this virtual address * mapped by the shadow page tables, and is it writable? */ -static int page_writable(struct lg_cpu *cpu, unsigned long vaddr) +static int page_writable(struct lguest *lg, unsigned long vaddr) { pgd_t *spgd; unsigned long flags; /* Look at the current top level entry: is it present? */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); + spgd = spgd_addr(lg, lg->pgdidx, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return 0; /* Check the flags on the pte entry itself: it must be present and * writable. */ - flags = pte_flags(*(spte_addr(*spgd, vaddr))); + flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -317,10 +317,10 @@ static int page_writable(struct lg_cpu *cpu, unsigned long vaddr) /* So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 * (meaning "write"). */ -void pin_page(struct lg_cpu *cpu, unsigned long vaddr) +void pin_page(struct lguest *lg, unsigned long vaddr) { - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) - kill_guest(cpu, "bad stack page %#lx", vaddr); + if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2)) + kill_guest(lg, "bad stack page %#lx", vaddr); } /*H:450 If we chase down the release_pgd() code, it looks like this: */ @@ -358,28 +358,28 @@ static void flush_user_mappings(struct lguest *lg, int idx) * * The Guest has a hypercall to throw away the page tables: it's used when a * large number of mappings have been changed. */ -void guest_pagetable_flush_user(struct lg_cpu *cpu) +void guest_pagetable_flush_user(struct lguest *lg) { /* Drop the userspace part of the current page table. */ - flush_user_mappings(cpu->lg, cpu->cpu_pgd); + flush_user_mappings(lg, lg->pgdidx); } /*:*/ /* We walk down the guest page tables to get a guest-physical address */ -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) { pgd_t gpgd; pte_t gpte; /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); /* Toplevel not present? We can't map it in. 
*/ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - kill_guest(cpu, "Bad address %#lx", vaddr); + kill_guest(lg, "Bad address %#lx", vaddr); - gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); + gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); if (!(pte_flags(gpte) & _PAGE_PRESENT)) - kill_guest(cpu, "Bad address %#lx", vaddr); + kill_guest(lg, "Bad address %#lx", vaddr); return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } @@ -399,7 +399,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) /*H:435 And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set * blank_pgdir. */ -static unsigned int new_pgdir(struct lg_cpu *cpu, +static unsigned int new_pgdir(struct lguest *lg, unsigned long gpgdir, int *blank_pgdir) { @@ -407,23 +407,22 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ - next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); + next = random32() % ARRAY_SIZE(lg->pgdirs); /* If it's never been allocated at all before, try now. */ - if (!cpu->lg->pgdirs[next].pgdir) { - cpu->lg->pgdirs[next].pgdir = - (pgd_t *)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[next].pgdir) { + lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); /* If the allocation fails, just keep using the one we have */ - if (!cpu->lg->pgdirs[next].pgdir) - next = cpu->cpu_pgd; + if (!lg->pgdirs[next].pgdir) + next = lg->pgdidx; else /* This is a blank page, so there are no kernel * mappings: caller must map the stack! */ *blank_pgdir = 1; } /* Record which Guest toplevel this shadows. */ - cpu->lg->pgdirs[next].gpgdir = gpgdir; + lg->pgdirs[next].gpgdir = gpgdir; /* Release all the non-kernel mappings. */ - flush_user_mappings(cpu->lg, next); + flush_user_mappings(lg, next); return next; } @@ -433,21 +432,21 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, * Now we've seen all the page table setting and manipulation, let's see what * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) +void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ - newpgdir = find_pgdir(cpu->lg, pgtable); + newpgdir = find_pgdir(lg, pgtable); /* If not, we allocate or mug an existing one: if it's a fresh one, * repin gets set to 1. */ - if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) - newpgdir = new_pgdir(cpu, pgtable, &repin); + if (newpgdir == ARRAY_SIZE(lg->pgdirs)) + newpgdir = new_pgdir(lg, pgtable, &repin); /* Change the current pgd index to the new one. */ - cpu->cpu_pgd = newpgdir; + lg->pgdidx = newpgdir; /* If it was completely blank, we map in the Guest kernel stack */ if (repin) - pin_stack_pages(cpu); + pin_stack_pages(lg); } /*H:470 Finally, a routine which throws away everything: all PGD entries in all @@ -469,11 +468,11 @@ static void release_all_pagetables(struct lguest *lg) * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as * everything faults back in, but it's rare. 
*/ -void guest_pagetable_clear_all(struct lg_cpu *cpu) +void guest_pagetable_clear_all(struct lguest *lg) { - release_all_pagetables(cpu->lg); + release_all_pagetables(lg); /* We need the Guest kernel stack mapped again. */ - pin_stack_pages(cpu); + pin_stack_pages(lg); } /*:*/ /*M:009 Since we throw away all mappings when a kernel mapping changes, our @@ -498,24 +497,24 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */ -static void do_set_pte(struct lg_cpu *cpu, int idx, +static void do_set_pte(struct lguest *lg, int idx, unsigned long vaddr, pte_t gpte) { /* Look up the matching shadow page directory entry. */ - pgd_t *spgd = spgd_addr(cpu, idx, vaddr); + pgd_t *spgd = spgd_addr(lg, idx, vaddr); /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { /* Otherwise, we start by releasing the existing entry. */ - pte_t *spte = spte_addr(*spgd, vaddr); + pte_t *spte = spte_addr(lg, *spgd, vaddr); release_pte(*spte); /* If they're setting this entry as dirty or accessed, we might * as well put that entry they've given us in now. This shaves * 10% off a copy-on-write micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); - *spte = gpte_to_spte(cpu, gpte, + check_gpte(lg, gpte); + *spte = gpte_to_spte(lg, gpte, pte_flags(gpte) & _PAGE_DIRTY); } else /* Otherwise kill it and we can demand_page() it in @@ -534,22 +533,22 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * * The benefit is that when we have to track a new page table, we can copy keep * all the kernel mappings. This speeds up context switch immensely. */ -void guest_set_pte(struct lg_cpu *cpu, +void guest_set_pte(struct lguest *lg, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { /* Kernel mappings must be changed on all top levels. Slow, but * doesn't happen often. */ - if (vaddr >= cpu->lg->kernel_address) { + if (vaddr >= lg->kernel_address) { unsigned int i; - for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) - if (cpu->lg->pgdirs[i].pgdir) - do_set_pte(cpu, i, vaddr, gpte); + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + do_set_pte(lg, i, vaddr, gpte); } else { /* Is this page table one we have a shadow for? */ - int pgdir = find_pgdir(cpu->lg, gpgdir); - if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) + int pgdir = find_pgdir(lg, gpgdir); + if (pgdir != ARRAY_SIZE(lg->pgdirs)) /* If so, do the update. */ - do_set_pte(cpu, pgdir, vaddr, gpte); + do_set_pte(lg, pgdir, vaddr, gpte); } } @@ -591,32 +590,30 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) { /* We start on the first shadow page table, and give it a blank PGD * page. */ - lg->pgdirs[0].gpgdir = pgtable; - lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); - if (!lg->pgdirs[0].pgdir) + lg->pgdidx = 0; + lg->pgdirs[lg->pgdidx].gpgdir = pgtable; + lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[lg->pgdidx].pgdir) return -ENOMEM; - lg->cpus[0].cpu_pgd = 0; return 0; } /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ -void page_table_guest_data_init(struct lg_cpu *cpu) +void page_table_guest_data_init(struct lguest *lg) { /* We get the kernel address: above this is all kernel memory. 
*/ - if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) + if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) /* We tell the Guest that it can't use the top 4MB of virtual * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); + || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) + || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ - if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) - kill_guest(cpu, "bad kernel address %#lx", - cpu->lg->kernel_address); + if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) + kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); } /* When a Guest dies, our cleanup is fairly simple. */ @@ -637,18 +634,17 @@ void free_guest_pagetable(struct lguest *lg) * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which * Guest is about to run on this CPU. */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) +void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); pgd_t switcher_pgd; pte_t regs_pte; - unsigned long pfn; /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); + switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); - cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; + lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher @@ -657,8 +653,7 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out * again. */ - pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; - regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); + regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; } /*:*/ diff --git a/trunk/drivers/lguest/segments.c b/trunk/drivers/lguest/segments.c index ec6aa3f1c36b..9e189cbec7dd 100644 --- a/trunk/drivers/lguest/segments.c +++ b/trunk/drivers/lguest/segments.c @@ -58,7 +58,7 @@ static int ignored_gdt(unsigned int num) * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a * mess: the message will be "unhandled trap 256". 
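 *
 * (To decode the magic numbers in the loop below -- a sketch, not from the
 * patch: in the high word ("b") of an x86 segment descriptor, bits 13-14 hold
 * the privilege level (DPL) and bit 8 is the "accessed" flag, so the fixup is
 * roughly
 *
 *	if ((desc->b & 0x00006000) == 0)        DPL still 0?
 *		desc->b |= (GUEST_PL << 13);    demote it to the Guest's level
 *	desc->b |= 0x00000100;                  pre-set "accessed"
 *
 * which keeps the CPU from ever having to write into the Guest's read-only
 * GDT copy.)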
*/ -static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) +static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) { unsigned int i; @@ -71,14 +71,14 @@ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) /* Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's * running at privilege level 1. If so, we fix it here. */ - if ((cpu->arch.gdt[i].b & 0x00006000) == 0) - cpu->arch.gdt[i].b |= (GUEST_PL << 13); + if ((lg->arch.gdt[i].b & 0x00006000) == 0) + lg->arch.gdt[i].b |= (GUEST_PL << 13); /* Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't * writable by the Guest, so bad things can happen. */ - cpu->arch.gdt[i].b |= 0x00000100; + lg->arch.gdt[i].b |= 0x00000100; } } @@ -109,31 +109,31 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) /* This routine sets up the initial Guest GDT for booting. All entries start * as 0 (unusable). */ -void setup_guest_gdt(struct lg_cpu *cpu) +void setup_guest_gdt(struct lguest *lg) { /* Start with full 0-4G segments... */ - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; + lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; + lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; /* ...except the Guest is allowed to use them, so set the privilege * level appropriately in the flags. */ - cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); - cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); + lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); + lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } /*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" * entries. */ -void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) +void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) { unsigned int i; for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) - gdt[i] = cpu->arch.gdt[i]; + gdt[i] = lg->arch.gdt[i]; } /*H:640 When the Guest is run on a different CPU, or the GDT entries have * changed, copy_gdt() is called to copy the Guest's GDT entries across to this * CPU's GDT. */ -void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) +void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) { unsigned int i; @@ -141,38 +141,38 @@ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) * replaced. See ignored_gdt() above. */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) - gdt[i] = cpu->arch.gdt[i]; + gdt[i] = lg->arch.gdt[i]; } /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). * We copy it from the Guest and tweak the entries. */ -void load_guest_gdt(struct lg_cpu *cpu, unsigned long table, u32 num) +void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) { /* We assume the Guest has the same number of GDT entries as the * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ - if (num > ARRAY_SIZE(cpu->arch.gdt)) - kill_guest(cpu, "too many gdt entries %i", num); + if (num > ARRAY_SIZE(lg->arch.gdt)) + kill_guest(lg, "too many gdt entries %i", num); /* We read the whole thing in, then fix it up. 
*/ - __lgread(cpu, cpu->arch.gdt, table, num * sizeof(cpu->arch.gdt[0])); - fixup_gdt_table(cpu, 0, ARRAY_SIZE(cpu->arch.gdt)); + __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); + fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); /* Mark that the GDT changed so the core knows it has to copy it again, * even if the Guest is run on the same CPU. */ - cpu->changed |= CHANGED_GDT; + lg->changed |= CHANGED_GDT; } /* This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover * both cases? */ -void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) +void guest_load_tls(struct lguest *lg, unsigned long gtls) { - struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; + struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; - __lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); - fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); + __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); + fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); /* Note that just the TLS entries have changed. */ - cpu->changed |= CHANGED_GDT_TLS; + lg->changed |= CHANGED_GDT_TLS; } /*:*/ diff --git a/trunk/drivers/lguest/x86/core.c b/trunk/drivers/lguest/x86/core.c index 61f2f8eb8cad..44adb00e1490 100644 --- a/trunk/drivers/lguest/x86/core.c +++ b/trunk/drivers/lguest/x86/core.c @@ -60,7 +60,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu) (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); } -static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); +static DEFINE_PER_CPU(struct lguest *, last_guest); /*S:010 * We approach the Switcher. @@ -73,16 +73,16 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); * since it last ran. We saw this set in interrupts_and_traps.c and * segments.c. */ -static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) +static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) { /* Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the * Guest has changed. */ - if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { - __get_cpu_var(last_cpu) = cpu; - cpu->last_pages = pages; - cpu->changed = CHANGED_ALL; + if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { + __get_cpu_var(last_guest) = lg; + lg->last_pages = pages; + lg->changed = CHANGED_ALL; } /* These copies are pretty cheap, so we do them unconditionally: */ @@ -90,42 +90,42 @@ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) pages->state.host_cr3 = __pa(current->mm->pgd); /* Set up the Guest's page tables to see this CPU's pages (and no * other CPU's pages). */ - map_switcher_in_guest(cpu, pages); + map_switcher_in_guest(lg, pages); /* Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege * level 1). */ - pages->state.guest_tss.esp1 = cpu->esp1; - pages->state.guest_tss.ss1 = cpu->ss1; + pages->state.guest_tss.sp1 = lg->esp1; + pages->state.guest_tss.ss1 = lg->ss1; /* Copy direct-to-Guest trap entries. 
*/ - if (cpu->changed & CHANGED_IDT) - copy_traps(cpu, pages->state.guest_idt, default_idt_entries); + if (lg->changed & CHANGED_IDT) + copy_traps(lg, pages->state.guest_idt, default_idt_entries); /* Copy all GDT entries which the Guest can change. */ - if (cpu->changed & CHANGED_GDT) - copy_gdt(cpu, pages->state.guest_gdt); + if (lg->changed & CHANGED_GDT) + copy_gdt(lg, pages->state.guest_gdt); /* If only the TLS entries have changed, copy them. */ - else if (cpu->changed & CHANGED_GDT_TLS) - copy_gdt_tls(cpu, pages->state.guest_gdt); + else if (lg->changed & CHANGED_GDT_TLS) + copy_gdt_tls(lg, pages->state.guest_gdt); /* Mark the Guest as unchanged for next time. */ - cpu->changed = 0; + lg->changed = 0; } /* Finally: the code to actually call into the Switcher to run the Guest. */ -static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) +static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) { /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; /* Copy the guest-specific information into this CPU's "struct * lguest_pages". */ - copy_in_guest_info(cpu, pages); + copy_in_guest_info(lg, pages); /* Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will * cause us to abort the Guest. */ - cpu->regs->trapnum = 256; + lg->regs->trapnum = 256; /* Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using @@ -143,7 +143,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page * directory. */ - : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) + : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) /* We tell gcc that all these registers could change, * which means we don't have to save and restore them in * the Switcher. */ @@ -161,12 +161,12 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts * are disabled: we own the CPU. */ -void lguest_arch_run_guest(struct lg_cpu *cpu) +void lguest_arch_run_guest(struct lguest *lg) { /* Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it * uses the FPU. */ - if (cpu->ts) + if (lg->ts) lguest_set_ts(); /* SYSENTER is an optimized way of doing system calls. We can't allow @@ -180,7 +180,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) /* Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it * was doing. */ - run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); + run_guest_once(lg, lguest_pages(raw_smp_processor_id())); /* Note that the "regs" pointer contains two extra entries which are * not really registers: a trap number which says what interrupt or @@ -191,11 +191,11 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite * cr2, or we could even move off to a different CPU. 
*/ - if (cpu->regs->trapnum == 14) - cpu->arch.last_pagefault = read_cr2(); + if (lg->regs->trapnum == 14) + lg->arch.last_pagefault = read_cr2(); /* Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. */ - else if (cpu->regs->trapnum == 7) + else if (lg->regs->trapnum == 7) math_state_restore(); /* Restore SYSENTER if it's supposed to be on. */ @@ -214,22 +214,22 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome * instructions and skip over it. We return true if we did. */ -static int emulate_insn(struct lg_cpu *cpu) +static int emulate_insn(struct lguest *lg) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; /* The eip contains the *virtual* address of the Guest's instruction: * guest_pa just subtracts the Guest's page_offset. */ - unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); + unsigned long physaddr = guest_pa(lg, lg->regs->eip); /* This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege * level. */ - if ((cpu->regs->cs & 3) != GUEST_PL) + if ((lg->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ - insn = lgread(cpu, physaddr, u8); + insn = lgread(lg, physaddr, u8); /* 0x66 is an "operand prefix". It means it's using the upper 16 bits of the eax register. */ @@ -237,7 +237,7 @@ static int emulate_insn(struct lg_cpu *cpu) shift = 16; /* The instruction is 1 byte so far, read the next byte. */ insnlen = 1; - insn = lgread(cpu, physaddr + insnlen, u8); + insn = lgread(lg, physaddr + insnlen, u8); } /* We can ignore the lower bit for the moment and decode the 4 opcodes @@ -268,26 +268,26 @@ static int emulate_insn(struct lg_cpu *cpu) if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) - cpu->regs->eax = 0xFFFFFFFF; + lg->regs->eax = 0xFFFFFFFF; else - cpu->regs->eax |= (0xFFFF << shift); + lg->regs->eax |= (0xFFFF << shift); } /* Finally, we've "done" the instruction, so move past it. */ - cpu->regs->eip += insnlen; + lg->regs->eip += insnlen; /* Success! */ return 1; } /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ -void lguest_arch_handle_trap(struct lg_cpu *cpu) +void lguest_arch_handle_trap(struct lguest *lg) { - switch (cpu->regs->trapnum) { + switch (lg->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ /* Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go * back into the Guest after we've done it. */ - if (cpu->regs->errcode == 0) { - if (emulate_insn(cpu)) + if (lg->regs->errcode == 0) { + if (emulate_insn(lg)) return; } break; @@ -301,8 +301,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * * The errcode tells whether this was a read or a write, and * whether kernel or userspace code. 
*/ - if (demand_page(cpu, cpu->arch.last_pagefault, - cpu->regs->errcode)) + if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) return; /* OK, it's really not there (or not OK): the Guest needs to @@ -312,16 +311,15 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so * lg->lguest_data could be NULL */ - if (cpu->lg->lguest_data && - put_user(cpu->arch.last_pagefault, - &cpu->lg->lguest_data->cr2)) - kill_guest(cpu, "Writing cr2"); + if (lg->lguest_data && + put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) + kill_guest(lg, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ /* If the Guest doesn't want to know, we already restored the * Floating Point Unit, so we just continue without telling * it. */ - if (!cpu->ts) + if (!lg->ts) return; break; case 32 ... 255: @@ -334,19 +332,19 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) case LGUEST_TRAP_ENTRY: /* Our 'struct hcall_args' maps directly over our regs: we set * up the pointer now to indicate a hypercall is pending. */ - cpu->hcall = (struct hcall_args *)cpu->regs; + lg->hcall = (struct hcall_args *)lg->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ - if (!deliver_trap(cpu, cpu->regs->trapnum)) + if (!deliver_trap(lg, lg->regs->trapnum)) /* If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let * it handle), it dies with a cryptic error message. */ - kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", - cpu->regs->trapnum, cpu->regs->eip, - cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault - : cpu->regs->errcode); + kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", + lg->regs->trapnum, lg->regs->eip, + lg->regs->trapnum == 14 ? lg->arch.last_pagefault + : lg->regs->errcode); } /* Now we can look at each of the routines this calls, in increasing order of @@ -489,17 +487,17 @@ void __exit lguest_arch_host_fini(void) /*H:122 The i386-specific hypercalls simply farm out to the right functions. */ -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) +int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) { switch (args->arg0) { case LHCALL_LOAD_GDT: - load_guest_gdt(cpu, args->arg1, args->arg2); + load_guest_gdt(lg, args->arg1, args->arg2); break; case LHCALL_LOAD_IDT_ENTRY: - load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); + load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3); break; case LHCALL_LOAD_TLS: - guest_load_tls(cpu, args->arg1); + guest_load_tls(lg, args->arg1); break; default: /* Bad Guest. Bad! */ @@ -509,14 +507,13 @@ int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) } /*H:126 i386-specific hypercall initialization: */ -int lguest_arch_init_hypercalls(struct lg_cpu *cpu) +int lguest_arch_init_hypercalls(struct lguest *lg) { u32 tsc_speed; /* The pointer to the Guest's "struct lguest_data" is the only * argument. We check that address now. */ - if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, - sizeof(*cpu->lg->lguest_data))) + if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) return -EFAULT; /* Having checked it, we simply set lg->lguest_data to point straight @@ -524,7 +521,7 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) * copy_to_user/from_user from now on, instead of lgread/write. 
I put * this in to show that I'm not immune to writing stupid * optimizations. */ - cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; + lg->lguest_data = lg->mem_base + lg->hcall->arg1; /* We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC @@ -537,12 +534,12 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) tsc_speed = tsc_khz; else tsc_speed = 0; - if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) + if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) return -EFAULT; /* The interrupt code might not like the system call vector. */ - if (!check_syscall_vector(cpu->lg)) - kill_guest(cpu, "bad syscall vector"); + if (!check_syscall_vector(lg)) + kill_guest(lg, "bad syscall vector"); return 0; } @@ -551,9 +548,9 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) * * Most of the Guest's registers are left alone: we used get_zeroed_page() to * allocate the structure, so they will be 0. */ -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) +void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) { - struct lguest_regs *regs = cpu->regs; + struct lguest_regs *regs = lg->regs; /* There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment @@ -580,5 +577,5 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) /* There are a couple of GDT entries the Guest expects when first * booting. */ - setup_guest_gdt(cpu); + setup_guest_gdt(lg); } diff --git a/trunk/drivers/s390/scsi/zfcp_fsf.c b/trunk/drivers/s390/scsi/zfcp_fsf.c index 0dff05840ee2..e45f85f7c7ed 100644 --- a/trunk/drivers/s390/scsi/zfcp_fsf.c +++ b/trunk/drivers/s390/scsi/zfcp_fsf.c @@ -4224,10 +4224,10 @@ zfcp_fsf_send_fcp_command_task_handler(struct zfcp_fsf_req *fsf_req) ZFCP_LOG_TRACE("%i bytes sense data provided by FCP\n", fcp_rsp_iu->fcp_sns_len); - memcpy(scpnt->sense_buffer, + memcpy(&scpnt->sense_buffer, zfcp_get_fcp_sns_info_ptr(fcp_rsp_iu), sns_len); ZFCP_HEX_DUMP(ZFCP_LOG_LEVEL_TRACE, - (void *)scpnt->sense_buffer, sns_len); + (void *) &scpnt->sense_buffer, sns_len); } /* check for overrun */ diff --git a/trunk/drivers/scsi/3w-9xxx.c b/trunk/drivers/scsi/3w-9xxx.c index b4912d1cee2a..1c244832c6c8 100644 --- a/trunk/drivers/scsi/3w-9xxx.c +++ b/trunk/drivers/scsi/3w-9xxx.c @@ -1990,6 +1990,7 @@ static struct scsi_host_template driver_template = { .max_sectors = TW_MAX_SECTORS, .cmd_per_lun = TW_MAX_CMDS_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = twa_host_attrs, .emulated = 1 }; diff --git a/trunk/drivers/scsi/3w-xxxx.c b/trunk/drivers/scsi/3w-xxxx.c index d09532162217..59716ebeb10c 100644 --- a/trunk/drivers/scsi/3w-xxxx.c +++ b/trunk/drivers/scsi/3w-xxxx.c @@ -2261,6 +2261,7 @@ static struct scsi_host_template driver_template = { .max_sectors = TW_MAX_SECTORS, .cmd_per_lun = TW_MAX_CMDS_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = tw_host_attrs, .emulated = 1 }; diff --git a/trunk/drivers/scsi/BusLogic.c b/trunk/drivers/scsi/BusLogic.c index 4d3ebb1af490..ead47c143ce0 100644 --- a/trunk/drivers/scsi/BusLogic.c +++ b/trunk/drivers/scsi/BusLogic.c @@ -3575,6 +3575,7 @@ static struct scsi_host_template Bus_Logic_template = { .unchecked_isa_dma = 1, .max_sectors = 128, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; /* diff --git a/trunk/drivers/scsi/Kconfig 
b/trunk/drivers/scsi/Kconfig index 14fc7f39e83e..3e161cd66463 100644 --- a/trunk/drivers/scsi/Kconfig +++ b/trunk/drivers/scsi/Kconfig @@ -345,7 +345,7 @@ config ISCSI_TCP config SGIWD93_SCSI tristate "SGI WD93C93 SCSI Driver" - depends on SGI_HAS_WD93 && SCSI + depends on SGI_IP22 && SCSI help If you have a Western Digital WD93 SCSI controller on an SGI MIPS system, say Y. Otherwise, say N. diff --git a/trunk/drivers/scsi/NCR53c406a.c b/trunk/drivers/scsi/NCR53c406a.c index 6961f78742ae..137d065db3da 100644 --- a/trunk/drivers/scsi/NCR53c406a.c +++ b/trunk/drivers/scsi/NCR53c406a.c @@ -1065,6 +1065,7 @@ static struct scsi_host_template driver_template = .cmd_per_lun = 1 /* commands per lun */, .unchecked_isa_dma = 1 /* unchecked_isa_dma */, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/a100u2w.c b/trunk/drivers/scsi/a100u2w.c index f608d4a1d6da..d3a6d15fb77a 100644 --- a/trunk/drivers/scsi/a100u2w.c +++ b/trunk/drivers/scsi/a100u2w.c @@ -1071,6 +1071,7 @@ static struct scsi_host_template inia100_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int __devinit inia100_probe_one(struct pci_dev *pdev, diff --git a/trunk/drivers/scsi/aacraid/commctrl.c b/trunk/drivers/scsi/aacraid/commctrl.c index f8afa358b6b6..851a7e599c50 100644 --- a/trunk/drivers/scsi/aacraid/commctrl.c +++ b/trunk/drivers/scsi/aacraid/commctrl.c @@ -243,6 +243,7 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) * Search the list of AdapterFibContext addresses on the adapter * to be sure this is a valid address */ + spin_lock_irqsave(&dev->fib_lock, flags); entry = dev->fib_list.next; fibctx = NULL; @@ -251,24 +252,25 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) /* * Extract the AdapterFibContext from the Input parameters. */ - if (fibctx->unique == f.fibctx) { /* We found a winner */ + if (fibctx->unique == f.fibctx) { /* We found a winner */ break; } entry = entry->next; fibctx = NULL; } if (!fibctx) { + spin_unlock_irqrestore(&dev->fib_lock, flags); dprintk ((KERN_INFO "Fib Context not found\n")); return -EINVAL; } if((fibctx->type != FSAFS_NTC_GET_ADAPTER_FIB_CONTEXT) || (fibctx->size != sizeof(struct aac_fib_context))) { + spin_unlock_irqrestore(&dev->fib_lock, flags); dprintk ((KERN_INFO "Fib Context corrupt?\n")); return -EINVAL; } status = 0; - spin_lock_irqsave(&dev->fib_lock, flags); /* * If there are no fibs to send back, then either wait or return * -EAGAIN @@ -326,7 +328,9 @@ static int next_getadapter_fib(struct aac_dev * dev, void __user *arg) int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) { struct fib *fib; + unsigned long flags; + spin_lock_irqsave(&dev->fib_lock, flags); /* * First free any FIBs that have not been consumed. */ @@ -349,6 +353,7 @@ int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context * fibctx) * Remove the Context from the AdapterFibContext List */ list_del(&fibctx->next); + spin_unlock_irqrestore(&dev->fib_lock, flags); /* * Invalidate context */ @@ -414,8 +419,8 @@ static int close_getadapter_fib(struct aac_dev * dev, void __user *arg) * @arg: ioctl arguments * * This routine returns the driver version. - * Under Linux, there have been no version incompatibilities, so this is - * simple! + * Under Linux, there have been no version incompatibilities, so this is + * simple! 
*/ static int check_revision(struct aac_dev *dev, void __user *arg) @@ -463,7 +468,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) u32 data_dir; void __user *sg_user[32]; void *sg_list[32]; - u32 sg_indx = 0; + u32 sg_indx = 0; u32 byte_count = 0; u32 actual_fibsize64, actual_fibsize = 0; int i; @@ -517,11 +522,11 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg) // Fix up srb for endian and force some values srbcmd->function = cpu_to_le32(SRBF_ExecuteScsi); // Force this - srbcmd->channel = cpu_to_le32(user_srbcmd->channel); + srbcmd->channel = cpu_to_le32(user_srbcmd->channel); srbcmd->id = cpu_to_le32(user_srbcmd->id); - srbcmd->lun = cpu_to_le32(user_srbcmd->lun); - srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); - srbcmd->flags = cpu_to_le32(flags); + srbcmd->lun = cpu_to_le32(user_srbcmd->lun); + srbcmd->timeout = cpu_to_le32(user_srbcmd->timeout); + srbcmd->flags = cpu_to_le32(flags); srbcmd->retry_limit = 0; // Obsolete parameter srbcmd->cdb_size = cpu_to_le32(user_srbcmd->cdb_size); memcpy(srbcmd->cdb, user_srbcmd->cdb, sizeof(srbcmd->cdb)); @@ -786,9 +791,9 @@ static int aac_get_pci_info(struct aac_dev* dev, void __user *arg) pci_info.bus = dev->pdev->bus->number; pci_info.slot = PCI_SLOT(dev->pdev->devfn); - if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { - dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); - return -EFAULT; + if (copy_to_user(arg, &pci_info, sizeof(struct aac_pci_info))) { + dprintk((KERN_DEBUG "aacraid: Could not copy pci info\n")); + return -EFAULT; } return 0; } diff --git a/trunk/drivers/scsi/aacraid/linit.c b/trunk/drivers/scsi/aacraid/linit.c index 0e8267c1e915..61be22774e99 100644 --- a/trunk/drivers/scsi/aacraid/linit.c +++ b/trunk/drivers/scsi/aacraid/linit.c @@ -1032,6 +1032,7 @@ static struct scsi_host_template aac_driver_template = { .cmd_per_lun = AAC_NUM_IO_FIB, #endif .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .emulated = 1, }; diff --git a/trunk/drivers/scsi/aha1740.c b/trunk/drivers/scsi/aha1740.c index 7c45d88a205b..be58a0b097c7 100644 --- a/trunk/drivers/scsi/aha1740.c +++ b/trunk/drivers/scsi/aha1740.c @@ -563,6 +563,7 @@ static struct scsi_host_template aha1740_template = { .sg_tablesize = AHA1740_SCATTER, .cmd_per_lun = AHA1740_CMDLUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = aha1740_eh_abort_handler, }; diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx.h b/trunk/drivers/scsi/aic7xxx/aic79xx.h index 2f00467b6b8c..ce638aa6005a 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx.h +++ b/trunk/drivers/scsi/aic7xxx/aic79xx.h @@ -1340,10 +1340,8 @@ struct ahd_pci_identity *ahd_find_pci_device(ahd_dev_softc_t); int ahd_pci_config(struct ahd_softc *, struct ahd_pci_identity *); int ahd_pci_test_register_access(struct ahd_softc *); -#ifdef CONFIG_PM void ahd_pci_suspend(struct ahd_softc *); void ahd_pci_resume(struct ahd_softc *); -#endif /************************** SCB and SCB queue management **********************/ void ahd_qinfifo_requeue_tail(struct ahd_softc *ahd, @@ -1354,10 +1352,8 @@ struct ahd_softc *ahd_alloc(void *platform_arg, char *name); int ahd_softc_init(struct ahd_softc *); void ahd_controller_info(struct ahd_softc *ahd, char *buf); int ahd_init(struct ahd_softc *ahd); -#ifdef CONFIG_PM int ahd_suspend(struct ahd_softc *ahd); void ahd_resume(struct ahd_softc *ahd); -#endif int ahd_default_config(struct ahd_softc *ahd); int ahd_parse_vpddata(struct ahd_softc *ahd, struct 
vpd_config *vpd); @@ -1365,6 +1361,7 @@ int ahd_parse_cfgdata(struct ahd_softc *ahd, struct seeprom_config *sc); void ahd_intr_enable(struct ahd_softc *ahd, int enable); void ahd_pause_and_flushwork(struct ahd_softc *ahd); +int ahd_suspend(struct ahd_softc *ahd); void ahd_set_unit(struct ahd_softc *, int); void ahd_set_name(struct ahd_softc *, char *); struct scb *ahd_get_scb(struct ahd_softc *ahd, u_int col_idx); diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_core.c b/trunk/drivers/scsi/aic7xxx/aic79xx_core.c index ade0fb8fbdb2..a7dd8cdda472 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_core.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_core.c @@ -7175,7 +7175,6 @@ ahd_pause_and_flushwork(struct ahd_softc *ahd) ahd->flags &= ~AHD_ALL_INTERRUPTS; } -#ifdef CONFIG_PM int ahd_suspend(struct ahd_softc *ahd) { @@ -7198,7 +7197,6 @@ ahd_resume(struct ahd_softc *ahd) ahd_intr_enable(ahd, TRUE); ahd_restart(ahd); } -#endif /************************** Busy Target Table *********************************/ /* diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c b/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c index 014654792901..0e4708fd43c8 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -766,6 +766,7 @@ struct scsi_host_template aic79xx_driver_template = { .max_sectors = 8192, .cmd_per_lun = 2, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .slave_alloc = ahd_linux_slave_alloc, .slave_configure = ahd_linux_slave_configure, .target_alloc = ahd_linux_target_alloc, @@ -1921,7 +1922,7 @@ ahd_linux_queue_cmd_complete(struct ahd_softc *ahd, struct scsi_cmnd *cmd) struct scsi_sense_data *sense; sense = (struct scsi_sense_data *) - cmd->sense_buffer; + &cmd->sense_buffer; if (sense->extra_len >= 5 && (sense->add_sense_code == 0x47 || sense->add_sense_code == 0x48)) diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c b/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c index 4150c8a8fdc2..66f0259edb69 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_osm_pci.c @@ -43,6 +43,17 @@ #include "aic79xx_inline.h" #include "aic79xx_pci.h" +static int ahd_linux_pci_dev_probe(struct pci_dev *pdev, + const struct pci_device_id *ent); +static int ahd_linux_pci_reserve_io_regions(struct ahd_softc *ahd, + u_long *base, u_long *base2); +static int ahd_linux_pci_reserve_mem_region(struct ahd_softc *ahd, + u_long *bus_addr, + uint8_t __iomem **maddr); +static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); +static int ahd_linux_pci_dev_resume(struct pci_dev *pdev); +static void ahd_linux_pci_dev_remove(struct pci_dev *pdev); + /* Define the macro locally since it's different for different class of chips. 
*/ #define ID(x) \ @@ -74,7 +85,17 @@ static struct pci_device_id ahd_linux_pci_id_table[] = { MODULE_DEVICE_TABLE(pci, ahd_linux_pci_id_table); +static struct pci_driver aic79xx_pci_driver = { + .name = "aic79xx", + .probe = ahd_linux_pci_dev_probe, #ifdef CONFIG_PM + .suspend = ahd_linux_pci_dev_suspend, + .resume = ahd_linux_pci_dev_resume, +#endif + .remove = ahd_linux_pci_dev_remove, + .id_table = ahd_linux_pci_id_table +}; + static int ahd_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) { @@ -118,7 +139,6 @@ ahd_linux_pci_dev_resume(struct pci_dev *pdev) return rc; } -#endif static void ahd_linux_pci_dev_remove(struct pci_dev *pdev) @@ -225,17 +245,6 @@ ahd_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return (0); } -static struct pci_driver aic79xx_pci_driver = { - .name = "aic79xx", - .probe = ahd_linux_pci_dev_probe, -#ifdef CONFIG_PM - .suspend = ahd_linux_pci_dev_suspend, - .resume = ahd_linux_pci_dev_resume, -#endif - .remove = ahd_linux_pci_dev_remove, - .id_table = ahd_linux_pci_id_table -}; - int ahd_linux_pci_init(void) { diff --git a/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c b/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c index df853676e66a..7a203a90601a 100644 --- a/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic79xx_pci.c @@ -389,7 +389,6 @@ ahd_pci_config(struct ahd_softc *ahd, struct ahd_pci_identity *entry) return error; } -#ifdef CONFIG_PM void ahd_pci_suspend(struct ahd_softc *ahd) { @@ -416,7 +415,6 @@ ahd_pci_resume(struct ahd_softc *ahd) ahd_pci_write_config(ahd->dev_softc, CSIZE_LATTIME, ahd->suspend_state.pci_state.csize_lattime, /*bytes*/1); } -#endif /* * Perform some simple tests that should catch situations where diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx.h b/trunk/drivers/scsi/aic7xxx/aic7xxx.h index c0344e617651..3d4e42d90452 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx.h +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx.h @@ -1143,9 +1143,7 @@ struct ahc_pci_identity *ahc_find_pci_device(ahc_dev_softc_t); int ahc_pci_config(struct ahc_softc *, struct ahc_pci_identity *); int ahc_pci_test_register_access(struct ahc_softc *); -#ifdef CONFIG_PM void ahc_pci_resume(struct ahc_softc *ahc); -#endif /*************************** EISA/VL Front End ********************************/ struct aic7770_identity *aic7770_find_device(uint32_t); @@ -1172,10 +1170,8 @@ int ahc_chip_init(struct ahc_softc *ahc); int ahc_init(struct ahc_softc *ahc); void ahc_intr_enable(struct ahc_softc *ahc, int enable); void ahc_pause_and_flushwork(struct ahc_softc *ahc); -#ifdef CONFIG_PM int ahc_suspend(struct ahc_softc *ahc); int ahc_resume(struct ahc_softc *ahc); -#endif void ahc_set_unit(struct ahc_softc *, int); void ahc_set_name(struct ahc_softc *, char *); void ahc_alloc_scbs(struct ahc_softc *ahc); diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c index 6d2ae641273c..f350b5e89e76 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_core.c @@ -5078,7 +5078,6 @@ ahc_pause_and_flushwork(struct ahc_softc *ahc) ahc->flags &= ~AHC_ALL_INTERRUPTS; } -#ifdef CONFIG_PM int ahc_suspend(struct ahc_softc *ahc) { @@ -5114,7 +5113,7 @@ ahc_resume(struct ahc_softc *ahc) ahc_restart(ahc); return (0); } -#endif + /************************** Busy Target Table *********************************/ /* * Return the untagged transaction id for a given target/channel lun. 
diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c index 99a3b33a3233..e310e414067f 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -747,6 +747,7 @@ struct scsi_host_template aic7xxx_driver_template = { .max_sectors = 8192, .cmd_per_lun = 2, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .slave_alloc = ahc_linux_slave_alloc, .slave_configure = ahc_linux_slave_configure, .target_alloc = ahc_linux_target_alloc, @@ -1657,12 +1658,9 @@ ahc_done(struct ahc_softc *ahc, struct scb *scb) untagged_q = &(ahc->untagged_queues[target_offset]); TAILQ_REMOVE(untagged_q, scb, links.tqe); BUG_ON(!TAILQ_EMPTY(untagged_q)); - } else if ((scb->flags & SCB_ACTIVE) == 0) { - /* - * Transactions aborted from the untagged queue may - * not have been dispatched to the controller, so - * only check the SCB_ACTIVE flag for tagged transactions. - */ + } + + if ((scb->flags & SCB_ACTIVE) == 0) { printf("SCB %d done'd twice\n", scb->hscb->tag); ahc_dump_card_state(ahc); panic("Stopping for safety"); diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c index dd6e21d6f1dd..4488946cff2e 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_osm_pci.c @@ -42,6 +42,17 @@ #include "aic7xxx_osm.h" #include "aic7xxx_pci.h" +static int ahc_linux_pci_dev_probe(struct pci_dev *pdev, + const struct pci_device_id *ent); +static int ahc_linux_pci_reserve_io_region(struct ahc_softc *ahc, + u_long *base); +static int ahc_linux_pci_reserve_mem_region(struct ahc_softc *ahc, + u_long *bus_addr, + uint8_t __iomem **maddr); +static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg); +static int ahc_linux_pci_dev_resume(struct pci_dev *pdev); +static void ahc_linux_pci_dev_remove(struct pci_dev *pdev); + /* Define the macro locally since it's different for different class of chips. 
*/ #define ID(x) ID_C(x, PCI_CLASS_STORAGE_SCSI) @@ -121,7 +132,17 @@ static struct pci_device_id ahc_linux_pci_id_table[] = { MODULE_DEVICE_TABLE(pci, ahc_linux_pci_id_table); +static struct pci_driver aic7xxx_pci_driver = { + .name = "aic7xxx", + .probe = ahc_linux_pci_dev_probe, #ifdef CONFIG_PM + .suspend = ahc_linux_pci_dev_suspend, + .resume = ahc_linux_pci_dev_resume, +#endif + .remove = ahc_linux_pci_dev_remove, + .id_table = ahc_linux_pci_id_table +}; + static int ahc_linux_pci_dev_suspend(struct pci_dev *pdev, pm_message_t mesg) { @@ -161,7 +182,6 @@ ahc_linux_pci_dev_resume(struct pci_dev *pdev) return (ahc_resume(ahc)); } -#endif static void ahc_linux_pci_dev_remove(struct pci_dev *pdev) @@ -269,17 +289,6 @@ ahc_linux_pci_dev_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return (0); } -static struct pci_driver aic7xxx_pci_driver = { - .name = "aic7xxx", - .probe = ahc_linux_pci_dev_probe, -#ifdef CONFIG_PM - .suspend = ahc_linux_pci_dev_suspend, - .resume = ahc_linux_pci_dev_resume, -#endif - .remove = ahc_linux_pci_dev_remove, - .id_table = ahc_linux_pci_id_table -}; - int ahc_linux_pci_init(void) { diff --git a/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c b/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c index 56848f41e4f9..ae35937b8055 100644 --- a/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c +++ b/trunk/drivers/scsi/aic7xxx/aic7xxx_pci.c @@ -2020,7 +2020,6 @@ ahc_pci_chip_init(struct ahc_softc *ahc) return (ahc_chip_init(ahc)); } -#ifdef CONFIG_PM void ahc_pci_resume(struct ahc_softc *ahc) { @@ -2052,7 +2051,6 @@ ahc_pci_resume(struct ahc_softc *ahc) ahc_release_seeprom(&sd); } } -#endif static int ahc_aic785X_setup(struct ahc_softc *ahc) diff --git a/trunk/drivers/scsi/aic7xxx_old.c b/trunk/drivers/scsi/aic7xxx_old.c index 3bfd9296bbfa..bcb0b870320c 100644 --- a/trunk/drivers/scsi/aic7xxx_old.c +++ b/trunk/drivers/scsi/aic7xxx_old.c @@ -11141,6 +11141,7 @@ static struct scsi_host_template driver_template = { .max_sectors = 2048, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/arcmsr/arcmsr_hba.c b/trunk/drivers/scsi/arcmsr/arcmsr_hba.c index f4a202e8df26..d80dba913a75 100644 --- a/trunk/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/trunk/drivers/scsi/arcmsr/arcmsr_hba.c @@ -122,6 +122,7 @@ static struct scsi_host_template arcmsr_scsi_host_template = { .max_sectors = ARCMSR_MAX_XFER_SECTORS, .cmd_per_lun = ARCMSR_MAX_CMD_PERLUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = arcmsr_host_attrs, }; #ifdef CONFIG_SCSI_ARCMSR_AER diff --git a/trunk/drivers/scsi/dc395x.c b/trunk/drivers/scsi/dc395x.c index 22ef3716e786..f93c73c0ba53 100644 --- a/trunk/drivers/scsi/dc395x.c +++ b/trunk/drivers/scsi/dc395x.c @@ -4763,6 +4763,7 @@ static struct scsi_host_template dc395x_driver_template = { .eh_bus_reset_handler = dc395x_eh_bus_reset, .unchecked_isa_dma = 0, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; diff --git a/trunk/drivers/scsi/dpt_i2o.c b/trunk/drivers/scsi/dpt_i2o.c index c9dd8392aab2..19cce125124c 100644 --- a/trunk/drivers/scsi/dpt_i2o.c +++ b/trunk/drivers/scsi/dpt_i2o.c @@ -3340,6 +3340,7 @@ static struct scsi_host_template driver_template = { .this_id = 7, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" MODULE_LICENSE("GPL"); diff --git a/trunk/drivers/scsi/eata.c b/trunk/drivers/scsi/eata.c index 
8be3d76656fa..05163cefec12 100644 --- a/trunk/drivers/scsi/eata.c +++ b/trunk/drivers/scsi/eata.c @@ -524,6 +524,7 @@ static struct scsi_host_template driver_template = { .this_id = 7, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/trunk/drivers/scsi/hosts.c b/trunk/drivers/scsi/hosts.c index 880c78bff0e1..5ea1f986220c 100644 --- a/trunk/drivers/scsi/hosts.c +++ b/trunk/drivers/scsi/hosts.c @@ -342,6 +342,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost->use_clustering = sht->use_clustering; shost->ordered_tag = sht->ordered_tag; shost->active_mode = sht->supported_mode; + shost->use_sg_chaining = sht->use_sg_chaining; if (sht->supported_mode == MODE_UNKNOWN) /* means we didn't set it ... default to INITIATOR */ diff --git a/trunk/drivers/scsi/hptiop.c b/trunk/drivers/scsi/hptiop.c index ff149ad6bc4e..e7b2f3575ce9 100644 --- a/trunk/drivers/scsi/hptiop.c +++ b/trunk/drivers/scsi/hptiop.c @@ -573,7 +573,7 @@ static void hptiop_finish_scsi_req(struct hptiop_hba *hba, u32 tag, scsi_set_resid(scp, scsi_bufflen(scp) - le32_to_cpu(req->dataxfer_length)); scp->result = SAM_STAT_CHECK_CONDITION; - memcpy(scp->sense_buffer, &req->sg_list, + memcpy(&scp->sense_buffer, &req->sg_list, min_t(size_t, SCSI_SENSE_BUFFERSIZE, le32_to_cpu(req->dataxfer_length))); break; @@ -906,6 +906,7 @@ static struct scsi_host_template driver_template = { .unchecked_isa_dma = 0, .emulated = 0, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .proc_name = driver_name, .shost_attrs = hptiop_attrs, .this_id = -1, diff --git a/trunk/drivers/scsi/ibmmca.c b/trunk/drivers/scsi/ibmmca.c index 4d15a62914e9..db004a450732 100644 --- a/trunk/drivers/scsi/ibmmca.c +++ b/trunk/drivers/scsi/ibmmca.c @@ -1501,6 +1501,7 @@ static struct scsi_host_template ibmmca_driver_template = { .sg_tablesize = 16, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int ibmmca_probe(struct device *dev) diff --git a/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c b/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c index 78d46a900bb5..30819012898f 100644 --- a/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/trunk/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -1600,6 +1600,7 @@ static struct scsi_host_template driver_template = { .this_id = -1, .sg_tablesize = SG_ALL, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = ibmvscsi_attrs, }; diff --git a/trunk/drivers/scsi/initio.c b/trunk/drivers/scsi/initio.c index 0cc8868ea35d..a10a5c74b48d 100644 --- a/trunk/drivers/scsi/initio.c +++ b/trunk/drivers/scsi/initio.c @@ -2833,6 +2833,7 @@ static struct scsi_host_template initio_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int initio_probe_one(struct pci_dev *pdev, diff --git a/trunk/drivers/scsi/iscsi_tcp.c b/trunk/drivers/scsi/iscsi_tcp.c index b6f99dfbb038..e5be5fd4ef58 100644 --- a/trunk/drivers/scsi/iscsi_tcp.c +++ b/trunk/drivers/scsi/iscsi_tcp.c @@ -1933,6 +1933,7 @@ static struct scsi_host_template iscsi_sht = { .eh_device_reset_handler= iscsi_eh_device_reset, .eh_host_reset_handler = iscsi_eh_host_reset, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .slave_configure = iscsi_tcp_slave_configure, .proc_name = "iscsi_tcp", .this_id = -1, diff --git 
a/trunk/drivers/scsi/libsrp.c b/trunk/drivers/scsi/libsrp.c index 6d6a76e65a6c..5cff0204227d 100644 --- a/trunk/drivers/scsi/libsrp.c +++ b/trunk/drivers/scsi/libsrp.c @@ -426,8 +426,8 @@ int srp_cmd_queue(struct Scsi_Host *shost, struct srp_cmd *cmd, void *info, sc->SCp.ptr = info; memcpy(sc->cmnd, cmd->cdb, MAX_COMMAND_SIZE); - sc->sdb.length = len; - sc->sdb.table.sgl = (void *) (unsigned long) addr; + sc->request_bufflen = len; + sc->request_buffer = (void *) (unsigned long) addr; sc->tag = tag; err = scsi_tgt_queue_command(sc, itn_id, (struct scsi_lun *)&cmd->lun, cmd->tag); diff --git a/trunk/drivers/scsi/lpfc/lpfc_scsi.c b/trunk/drivers/scsi/lpfc/lpfc_scsi.c index fc5c3a42b05a..6483c62730b3 100644 --- a/trunk/drivers/scsi/lpfc/lpfc_scsi.c +++ b/trunk/drivers/scsi/lpfc/lpfc_scsi.c @@ -1459,6 +1459,7 @@ struct scsi_host_template lpfc_template = { .scan_finished = lpfc_scan_finished, .this_id = -1, .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, + .use_sg_chaining = ENABLE_SG_CHAINING, .cmd_per_lun = LPFC_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = lpfc_hba_attrs, @@ -1481,6 +1482,7 @@ struct scsi_host_template lpfc_vport_template = { .sg_tablesize = LPFC_DEFAULT_SG_SEG_CNT, .cmd_per_lun = LPFC_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = lpfc_vport_attrs, .max_sectors = 0xFFFF, }; diff --git a/trunk/drivers/scsi/mac53c94.c b/trunk/drivers/scsi/mac53c94.c index b12ad7c7c673..a035001f4438 100644 --- a/trunk/drivers/scsi/mac53c94.c +++ b/trunk/drivers/scsi/mac53c94.c @@ -402,6 +402,7 @@ static struct scsi_host_template mac53c94_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match) diff --git a/trunk/drivers/scsi/megaraid.c b/trunk/drivers/scsi/megaraid.c index 4d59ae8491a4..765c24d2bc38 100644 --- a/trunk/drivers/scsi/megaraid.c +++ b/trunk/drivers/scsi/megaraid.c @@ -4490,6 +4490,7 @@ static struct scsi_host_template megaraid_template = { .sg_tablesize = MAX_SGLIST, .cmd_per_lun = DEF_CMD_PER_LUN, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = megaraid_abort, .eh_device_reset_handler = megaraid_reset, .eh_bus_reset_handler = megaraid_reset, diff --git a/trunk/drivers/scsi/megaraid/megaraid_mbox.c b/trunk/drivers/scsi/megaraid/megaraid_mbox.c index 6db77c00e3ee..24e32e446e76 100644 --- a/trunk/drivers/scsi/megaraid/megaraid_mbox.c +++ b/trunk/drivers/scsi/megaraid/megaraid_mbox.c @@ -361,6 +361,7 @@ static struct scsi_host_template megaraid_template_g = { .eh_host_reset_handler = megaraid_reset_handler, .change_queue_depth = megaraid_change_queue_depth, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sdev_attrs = megaraid_sdev_attrs, .shost_attrs = megaraid_shost_attrs, }; diff --git a/trunk/drivers/scsi/megaraid/megaraid_sas.c b/trunk/drivers/scsi/megaraid/megaraid_sas.c index 672c759ac24d..d7ec921865c4 100644 --- a/trunk/drivers/scsi/megaraid/megaraid_sas.c +++ b/trunk/drivers/scsi/megaraid/megaraid_sas.c @@ -1192,6 +1192,7 @@ static struct scsi_host_template megasas_template = { .eh_timed_out = megasas_reset_timer, .bios_param = megasas_bios_param, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; /** diff --git a/trunk/drivers/scsi/mesh.c b/trunk/drivers/scsi/mesh.c index 651d09b08f2a..7470ff39ab22 100644 --- 
a/trunk/drivers/scsi/mesh.c +++ b/trunk/drivers/scsi/mesh.c @@ -1843,6 +1843,7 @@ static struct scsi_host_template mesh_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match) diff --git a/trunk/drivers/scsi/ncr53c8xx.c b/trunk/drivers/scsi/ncr53c8xx.c index c5ebf018b378..c02771aa6c9b 100644 --- a/trunk/drivers/scsi/ncr53c8xx.c +++ b/trunk/drivers/scsi/ncr53c8xx.c @@ -4967,7 +4967,7 @@ void ncr_complete (struct ncb *np, struct ccb *cp) sizeof(cp->sense_buf))); if (DEBUG_FLAGS & (DEBUG_RESULT|DEBUG_TINY)) { - u_char *p = cmd->sense_buffer; + u_char * p = (u_char*) & cmd->sense_buffer; int i; PRINT_ADDR(cmd, "sense data:"); for (i=0; i<14; i++) printk (" %x", *p++); diff --git a/trunk/drivers/scsi/nsp32.c b/trunk/drivers/scsi/nsp32.c index 7fed35372150..28161dc95e0d 100644 --- a/trunk/drivers/scsi/nsp32.c +++ b/trunk/drivers/scsi/nsp32.c @@ -281,6 +281,7 @@ static struct scsi_host_template nsp32_template = { .cmd_per_lun = 1, .this_id = NSP32_HOST_SCSIID, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .eh_abort_handler = nsp32_eh_abort, .eh_bus_reset_handler = nsp32_eh_bus_reset, .eh_host_reset_handler = nsp32_eh_host_reset, diff --git a/trunk/drivers/scsi/pcmcia/sym53c500_cs.c b/trunk/drivers/scsi/pcmcia/sym53c500_cs.c index 3454a5714749..969b9387a0c3 100644 --- a/trunk/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/trunk/drivers/scsi/pcmcia/sym53c500_cs.c @@ -692,6 +692,7 @@ static struct scsi_host_template sym53c500_driver_template = { .sg_tablesize = 32, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .shost_attrs = SYM53C500_shost_attrs }; diff --git a/trunk/drivers/scsi/qla1280.c b/trunk/drivers/scsi/qla1280.c index 68c0d09ffe78..c94906abfee3 100644 --- a/trunk/drivers/scsi/qla1280.c +++ b/trunk/drivers/scsi/qla1280.c @@ -4204,6 +4204,7 @@ static struct scsi_host_template qla1280_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; diff --git a/trunk/drivers/scsi/qla2xxx/qla_os.c b/trunk/drivers/scsi/qla2xxx/qla_os.c index 3954ed2d7b51..aba1e6d48066 100644 --- a/trunk/drivers/scsi/qla2xxx/qla_os.c +++ b/trunk/drivers/scsi/qla2xxx/qla_os.c @@ -131,6 +131,7 @@ static struct scsi_host_template qla2x00_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, /* @@ -162,6 +163,7 @@ struct scsi_host_template qla24xx_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, .max_sectors = 0xFFFF, diff --git a/trunk/drivers/scsi/qla4xxx/ql4_os.c b/trunk/drivers/scsi/qla4xxx/ql4_os.c index 2e2b9fedffcc..d3f86646cb08 100644 --- a/trunk/drivers/scsi/qla4xxx/ql4_os.c +++ b/trunk/drivers/scsi/qla4xxx/ql4_os.c @@ -94,6 +94,7 @@ static struct scsi_host_template qla4xxx_driver_template = { .this_id = -1, .cmd_per_lun = 3, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .sg_tablesize = SG_ALL, .max_sectors = 0xFFFF, diff --git a/trunk/drivers/scsi/qlogicfas.c b/trunk/drivers/scsi/qlogicfas.c index 1e874f1fb5c6..1769f965eedf 100644 --- a/trunk/drivers/scsi/qlogicfas.c +++ b/trunk/drivers/scsi/qlogicfas.c @@ -197,6 +197,7 @@ static struct scsi_host_template 
qlogicfas_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static __init int qlogicfas_init(void) diff --git a/trunk/drivers/scsi/scsi.c b/trunk/drivers/scsi/scsi.c index b35d19472caa..1a9fba6a9f92 100644 --- a/trunk/drivers/scsi/scsi.c +++ b/trunk/drivers/scsi/scsi.c @@ -757,7 +757,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd) "Notifying upper driver of completion " "(result %x)\n", cmd->result)); - good_bytes = scsi_bufflen(cmd); + good_bytes = cmd->request_bufflen; if (cmd->request->cmd_type != REQ_TYPE_BLOCK_PC) { drv = scsi_cmd_to_driver(cmd); if (drv->done) diff --git a/trunk/drivers/scsi/scsi_debug.c b/trunk/drivers/scsi/scsi_debug.c index 1541c174937a..82c06f0a9d02 100644 --- a/trunk/drivers/scsi/scsi_debug.c +++ b/trunk/drivers/scsi/scsi_debug.c @@ -280,8 +280,6 @@ static int resp_write(struct scsi_cmnd * SCpnt, unsigned long long lba, unsigned int num, struct sdebug_dev_info * devip); static int resp_report_luns(struct scsi_cmnd * SCpnt, struct sdebug_dev_info * devip); -static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, - unsigned int num, struct sdebug_dev_info *devip); static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, int arr_len); static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, @@ -313,48 +311,12 @@ static void sdebug_max_tgts_luns(void); static struct device pseudo_primary; static struct bus_type pseudo_lld_bus; -static void get_data_transfer_info(unsigned char *cmd, - unsigned long long *lba, unsigned int *num) -{ - int i; - - switch (*cmd) { - case WRITE_16: - case READ_16: - for (*lba = 0, i = 0; i < 8; ++i) { - if (i > 0) - *lba <<= 8; - *lba += cmd[2 + i]; - } - *num = cmd[13] + (cmd[12] << 8) + - (cmd[11] << 16) + (cmd[10] << 24); - break; - case WRITE_12: - case READ_12: - *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); - *num = cmd[9] + (cmd[8] << 8) + (cmd[7] << 16) + (cmd[6] << 24); - break; - case WRITE_10: - case READ_10: - case XDWRITEREAD_10: - *lba = cmd[5] + (cmd[4] << 8) + (cmd[3] << 16) + (cmd[2] << 24); - *num = cmd[8] + (cmd[7] << 8); - break; - case WRITE_6: - case READ_6: - *lba = cmd[3] + (cmd[2] << 8) + ((cmd[1] & 0x1f) << 16); - *num = (0 == cmd[4]) ? 256 : cmd[4]; - break; - default: - break; - } -} static int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) { unsigned char *cmd = (unsigned char *) SCpnt->cmnd; - int len, k; + int len, k, j; unsigned int num; unsigned long long lba; int errsts = 0; @@ -490,7 +452,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) break; if (scsi_debug_fake_rw) break; - get_data_transfer_info(cmd, &lba, &num); + if ((*cmd) == READ_16) { + for (lba = 0, j = 0; j < 8; ++j) { + if (j > 0) + lba <<= 8; + lba += cmd[2 + j]; + } + num = cmd[13] + (cmd[12] << 8) + + (cmd[11] << 16) + (cmd[10] << 24); + } else if ((*cmd) == READ_12) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[9] + (cmd[8] << 8) + + (cmd[7] << 16) + (cmd[6] << 24); + } else if ((*cmd) == READ_10) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[8] + (cmd[7] << 8); + } else { /* READ (6) */ + lba = cmd[3] + (cmd[2] << 8) + + ((cmd[1] & 0x1f) << 16); + num = (0 == cmd[4]) ? 
256 : cmd[4]; + } errsts = resp_read(SCpnt, lba, num, devip); if (inj_recovered && (0 == errsts)) { mk_sense_buffer(devip, RECOVERED_ERROR, @@ -517,7 +500,28 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) break; if (scsi_debug_fake_rw) break; - get_data_transfer_info(cmd, &lba, &num); + if ((*cmd) == WRITE_16) { + for (lba = 0, j = 0; j < 8; ++j) { + if (j > 0) + lba <<= 8; + lba += cmd[2 + j]; + } + num = cmd[13] + (cmd[12] << 8) + + (cmd[11] << 16) + (cmd[10] << 24); + } else if ((*cmd) == WRITE_12) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[9] + (cmd[8] << 8) + + (cmd[7] << 16) + (cmd[6] << 24); + } else if ((*cmd) == WRITE_10) { + lba = cmd[5] + (cmd[4] << 8) + + (cmd[3] << 16) + (cmd[2] << 24); + num = cmd[8] + (cmd[7] << 8); + } else { /* WRITE (6) */ + lba = cmd[3] + (cmd[2] << 8) + + ((cmd[1] & 0x1f) << 16); + num = (0 == cmd[4]) ? 256 : cmd[4]; + } errsts = resp_write(SCpnt, lba, num, devip); if (inj_recovered && (0 == errsts)) { mk_sense_buffer(devip, RECOVERED_ERROR, @@ -545,28 +549,6 @@ int scsi_debug_queuecommand(struct scsi_cmnd * SCpnt, done_funct_t done) case WRITE_BUFFER: errsts = check_readiness(SCpnt, 1, devip); break; - case XDWRITEREAD_10: - if (!scsi_bidi_cmnd(SCpnt)) { - mk_sense_buffer(devip, ILLEGAL_REQUEST, - INVALID_FIELD_IN_CDB, 0); - errsts = check_condition_result; - break; - } - - errsts = check_readiness(SCpnt, 0, devip); - if (errsts) - break; - if (scsi_debug_fake_rw) - break; - get_data_transfer_info(cmd, &lba, &num); - errsts = resp_read(SCpnt, lba, num, devip); - if (errsts) - break; - errsts = resp_write(SCpnt, lba, num, devip); - if (errsts) - break; - errsts = resp_xdwriteread(SCpnt, lba, num, devip); - break; default: if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) printk(KERN_INFO "scsi_debug: Opcode: 0x%x not " @@ -619,18 +601,18 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, int k, req_len, act_len, len, active; void * kaddr; void * kaddr_off; - struct scatterlist *sg; - struct scsi_data_buffer *sdb = scsi_in(scp); + struct scatterlist * sg; - if (!sdb->length) + if (0 == scsi_bufflen(scp)) return 0; - if (!sdb->table.sgl) + if (NULL == scsi_sglist(scp)) return (DID_ERROR << 16); - if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_FROM_DEVICE)) + if (! ((scp->sc_data_direction == DMA_BIDIRECTIONAL) || + (scp->sc_data_direction == DMA_FROM_DEVICE))) return (DID_ERROR << 16); active = 1; req_len = act_len = 0; - for_each_sg(sdb->table.sgl, sg, sdb->table.nents, k) { + scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { if (active) { kaddr = (unsigned char *) kmap_atomic(sg_page(sg), KM_USER0); @@ -648,10 +630,10 @@ static int fill_from_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, } req_len += sg->length; } - if (sdb->resid) - sdb->resid -= act_len; + if (scsi_get_resid(scp)) + scsi_set_resid(scp, scsi_get_resid(scp) - act_len); else - sdb->resid = req_len - act_len; + scsi_set_resid(scp, req_len - act_len); return 0; } @@ -668,7 +650,8 @@ static int fetch_to_dev_buffer(struct scsi_cmnd * scp, unsigned char * arr, return 0; if (NULL == scsi_sglist(scp)) return -1; - if (!(scsi_bidi_cmnd(scp) || scp->sc_data_direction == DMA_TO_DEVICE)) + if (! 
((scp->sc_data_direction == DMA_BIDIRECTIONAL) || + (scp->sc_data_direction == DMA_TO_DEVICE))) return -1; req_len = fin = 0; scsi_for_each_sg(scp, sg, scsi_sg_count(scp), k) { @@ -1973,50 +1956,6 @@ static int resp_report_luns(struct scsi_cmnd * scp, min((int)alloc_len, SDEBUG_RLUN_ARR_SZ)); } -static int resp_xdwriteread(struct scsi_cmnd *scp, unsigned long long lba, - unsigned int num, struct sdebug_dev_info *devip) -{ - int i, j, ret = -1; - unsigned char *kaddr, *buf; - unsigned int offset; - struct scatterlist *sg; - struct scsi_data_buffer *sdb = scsi_in(scp); - - /* better not to use temporary buffer. */ - buf = kmalloc(scsi_bufflen(scp), GFP_ATOMIC); - if (!buf) - return ret; - - offset = 0; - scsi_for_each_sg(scp, sg, scsi_sg_count(scp), i) { - kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); - if (!kaddr) - goto out; - - memcpy(buf + offset, kaddr + sg->offset, sg->length); - offset += sg->length; - kunmap_atomic(kaddr, KM_USER0); - } - - offset = 0; - for_each_sg(sdb->table.sgl, sg, sdb->table.nents, i) { - kaddr = (unsigned char *)kmap_atomic(sg_page(sg), KM_USER0); - if (!kaddr) - goto out; - - for (j = 0; j < sg->length; j++) - *(kaddr + sg->offset + j) ^= *(buf + offset + j); - - offset += sg->length; - kunmap_atomic(kaddr, KM_USER0); - } - ret = 0; -out: - kfree(buf); - - return ret; -} - /* When timer goes off this function is called. */ static void timer_intr_handler(unsigned long indx) { @@ -2050,7 +1989,6 @@ static int scsi_debug_slave_alloc(struct scsi_device * sdp) if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %u>\n", sdp->host->host_no, sdp->channel, sdp->id, sdp->lun); - set_bit(QUEUE_FLAG_BIDI, &sdp->request_queue->queue_flags); return 0; } diff --git a/trunk/drivers/scsi/scsi_error.c b/trunk/drivers/scsi/scsi_error.c index 045a0868fc7b..547e85aa414f 100644 --- a/trunk/drivers/scsi/scsi_error.c +++ b/trunk/drivers/scsi/scsi_error.c @@ -617,27 +617,29 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, ses->cmd_len = scmd->cmd_len; memcpy(ses->cmnd, scmd->cmnd, sizeof(scmd->cmnd)); ses->data_direction = scmd->sc_data_direction; - ses->sdb = scmd->sdb; - ses->next_rq = scmd->request->next_rq; + ses->bufflen = scmd->request_bufflen; + ses->buffer = scmd->request_buffer; + ses->use_sg = scmd->use_sg; + ses->resid = scmd->resid; ses->result = scmd->result; - memset(&scmd->sdb, 0, sizeof(scmd->sdb)); - scmd->request->next_rq = NULL; - if (sense_bytes) { - scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, - sense_bytes); + scmd->request_bufflen = min_t(unsigned, + SCSI_SENSE_BUFFERSIZE, sense_bytes); sg_init_one(&ses->sense_sgl, scmd->sense_buffer, - scmd->sdb.length); - scmd->sdb.table.sgl = &ses->sense_sgl; + scmd->request_bufflen); + scmd->request_buffer = &ses->sense_sgl; scmd->sc_data_direction = DMA_FROM_DEVICE; - scmd->sdb.table.nents = 1; + scmd->use_sg = 1; memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); scmd->cmnd[0] = REQUEST_SENSE; - scmd->cmnd[4] = scmd->sdb.length; + scmd->cmnd[4] = scmd->request_bufflen; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } else { + scmd->request_buffer = NULL; + scmd->request_bufflen = 0; scmd->sc_data_direction = DMA_NONE; + scmd->use_sg = 0; if (cmnd) { memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); memcpy(scmd->cmnd, cmnd, cmnd_size); @@ -674,8 +676,10 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) scmd->cmd_len = ses->cmd_len; memcpy(scmd->cmnd, ses->cmnd, sizeof(scmd->cmnd)); scmd->sc_data_direction = 
ses->data_direction; - scmd->sdb = ses->sdb; - scmd->request->next_rq = ses->next_rq; + scmd->request_bufflen = ses->bufflen; + scmd->request_buffer = ses->buffer; + scmd->use_sg = ses->use_sg; + scmd->resid = ses->resid; scmd->result = ses->result; } EXPORT_SYMBOL(scsi_eh_restore_cmnd); @@ -1696,7 +1700,8 @@ scsi_reset_provider(struct scsi_device *dev, int flag) memset(&scmd->cmnd, '\0', sizeof(scmd->cmnd)); scmd->scsi_done = scsi_reset_provider_done_command; - memset(&scmd->sdb, 0, sizeof(scmd->sdb)); + scmd->request_buffer = NULL; + scmd->request_bufflen = 0; scmd->cmd_len = 0; diff --git a/trunk/drivers/scsi/scsi_lib.c b/trunk/drivers/scsi/scsi_lib.c index b12fb310e399..7c4c889c5221 100644 --- a/trunk/drivers/scsi/scsi_lib.c +++ b/trunk/drivers/scsi/scsi_lib.c @@ -8,7 +8,6 @@ */ #include -#include #include #include #include @@ -35,6 +34,13 @@ #define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools) #define SG_MEMPOOL_SIZE 2 +/* + * The maximum number of SG segments that we will put inside a scatterlist + * (unless chaining is used). Should ideally fit inside a single page, to + * avoid a higher order allocation. + */ +#define SCSI_MAX_SG_SEGMENTS 128 + struct scsi_host_sg_pool { size_t size; char *name; @@ -42,31 +48,22 @@ struct scsi_host_sg_pool { mempool_t *pool; }; -#define SP(x) { x, "sgpool-" __stringify(x) } -#if (SCSI_MAX_SG_SEGMENTS < 32) -#error SCSI_MAX_SG_SEGMENTS is too small (must be 32 or greater) -#endif +#define SP(x) { x, "sgpool-" #x } static struct scsi_host_sg_pool scsi_sg_pools[] = { SP(8), SP(16), -#if (SCSI_MAX_SG_SEGMENTS > 32) +#if (SCSI_MAX_SG_SEGMENTS > 16) SP(32), -#if (SCSI_MAX_SG_SEGMENTS > 64) +#if (SCSI_MAX_SG_SEGMENTS > 32) SP(64), -#if (SCSI_MAX_SG_SEGMENTS > 128) +#if (SCSI_MAX_SG_SEGMENTS > 64) SP(128), -#if (SCSI_MAX_SG_SEGMENTS > 256) -#error SCSI_MAX_SG_SEGMENTS is too large (256 MAX) -#endif #endif #endif #endif - SP(SCSI_MAX_SG_SEGMENTS) }; #undef SP -static struct kmem_cache *scsi_bidi_sdb_cache; - static void scsi_run_queue(struct request_queue *q); /* @@ -443,7 +440,7 @@ EXPORT_SYMBOL_GPL(scsi_execute_async); static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) { cmd->serial_number = 0; - scsi_set_resid(cmd, 0); + cmd->resid = 0; memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); if (cmd->cmd_len == 0) cmd->cmd_len = COMMAND_SIZE(cmd->cmnd[0]); @@ -693,16 +690,42 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, return NULL; } +/* + * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit + * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. + */ +#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 + static inline unsigned int scsi_sgtable_index(unsigned short nents) { unsigned int index; - BUG_ON(nents > SCSI_MAX_SG_SEGMENTS); - - if (nents <= 8) + switch (nents) { + case 1 ... 8: index = 0; - else - index = get_count_order(nents) - 3; + break; + case 9 ... 16: + index = 1; + break; +#if (SCSI_MAX_SG_SEGMENTS > 16) + case 17 ... 32: + index = 2; + break; +#if (SCSI_MAX_SG_SEGMENTS > 32) + case 33 ... 64: + index = 3; + break; +#if (SCSI_MAX_SG_SEGMENTS > 64) + case 65 ... 
128: + index = 4; + break; +#endif +#endif +#endif + default: + printk(KERN_ERR "scsi: bad segment count=%d\n", nents); + BUG(); + } return index; } @@ -723,27 +746,31 @@ static struct scatterlist *scsi_sg_alloc(unsigned int nents, gfp_t gfp_mask) return mempool_alloc(sgp->pool, gfp_mask); } -static int scsi_alloc_sgtable(struct scsi_data_buffer *sdb, int nents, - gfp_t gfp_mask) +int scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask) { int ret; - BUG_ON(!nents); + BUG_ON(!cmd->use_sg); - ret = __sg_alloc_table(&sdb->table, nents, SCSI_MAX_SG_SEGMENTS, - gfp_mask, scsi_sg_alloc); + ret = __sg_alloc_table(&cmd->sg_table, cmd->use_sg, + SCSI_MAX_SG_SEGMENTS, gfp_mask, scsi_sg_alloc); if (unlikely(ret)) - __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, + __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); + cmd->request_buffer = cmd->sg_table.sgl; return ret; } -static void scsi_free_sgtable(struct scsi_data_buffer *sdb) +EXPORT_SYMBOL(scsi_alloc_sgtable); + +void scsi_free_sgtable(struct scsi_cmnd *cmd) { - __sg_free_table(&sdb->table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); + __sg_free_table(&cmd->sg_table, SCSI_MAX_SG_SEGMENTS, scsi_sg_free); } +EXPORT_SYMBOL(scsi_free_sgtable); + /* * Function: scsi_release_buffers() * @@ -761,49 +788,17 @@ static void scsi_free_sgtable(struct scsi_data_buffer *sdb) * the scatter-gather table, and potentially any bounce * buffers. */ -void scsi_release_buffers(struct scsi_cmnd *cmd) -{ - if (cmd->sdb.table.nents) - scsi_free_sgtable(&cmd->sdb); - - memset(&cmd->sdb, 0, sizeof(cmd->sdb)); - - if (scsi_bidi_cmnd(cmd)) { - struct scsi_data_buffer *bidi_sdb = - cmd->request->next_rq->special; - scsi_free_sgtable(bidi_sdb); - kmem_cache_free(scsi_bidi_sdb_cache, bidi_sdb); - cmd->request->next_rq->special = NULL; - } -} -EXPORT_SYMBOL(scsi_release_buffers); - -/* - * Bidi commands Must be complete as a whole, both sides at once. - * If part of the bytes were written and lld returned - * scsi_in()->resid and/or scsi_out()->resid this information will be left - * in req->data_len and req->next_rq->data_len. The upper-layer driver can - * decide what to do with this information. - */ -void scsi_end_bidi_request(struct scsi_cmnd *cmd) +static void scsi_release_buffers(struct scsi_cmnd *cmd) { - struct request *req = cmd->request; - unsigned int dlen = req->data_len; - unsigned int next_dlen = req->next_rq->data_len; - - req->data_len = scsi_out(cmd)->resid; - req->next_rq->data_len = scsi_in(cmd)->resid; - - /* The req and req->next_rq have not been completed */ - BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen)); - - scsi_release_buffers(cmd); + if (cmd->use_sg) + scsi_free_sgtable(cmd); /* - * This will goose the queue request function at the end, so we don't - * need to worry about launching another command. + * Zero these out. They now point to freed memory, and it is + * dangerous to hang onto the pointers. 
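The bucket-selection helper restored above maps a segment count onto one of the sgpool mempools (sgpool-8 through sgpool-128) with an explicit switch, where the code being reverted derived the same index from get_count_order(nents) - 3. A minimal standalone sketch in plain userspace C, checking that the two computations agree over the supported 1..128 range; the helper names and main() are illustrative, not part of the patch.

#include <assert.h>
#include <stdio.h>

/* Illustrative only: which sgpool bucket serves a given segment count.
 * Pool sizes assumed here are 8, 16, 32, 64 and 128. */
static unsigned int index_by_switch(unsigned short nents)
{
	if (nents <= 8)
		return 0;
	if (nents <= 16)
		return 1;
	if (nents <= 32)
		return 2;
	if (nents <= 64)
		return 3;
	return 4;	/* 65..128 */
}

/* get_count_order(n) is ceil(log2(n)); subtracting 3 skips the 1..8 bucket. */
static unsigned int index_by_order(unsigned short nents)
{
	unsigned int order = 0;

	while ((1u << order) < nents)
		order++;
	return nents <= 8 ? 0 : order - 3;
}

int main(void)
{
	for (unsigned short n = 1; n <= 128; n++)
		assert(index_by_switch(n) == index_by_order(n));
	printf("100 segments come from sgpool-%u\n", 8u << index_by_switch(100));
	return 0;
}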
*/ - scsi_next_command(cmd); + cmd->request_buffer = NULL; + cmd->request_bufflen = 0; } /* @@ -837,7 +832,7 @@ void scsi_end_bidi_request(struct scsi_cmnd *cmd) void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) { int result = cmd->result; - int this_count = scsi_bufflen(cmd); + int this_count = cmd->request_bufflen; struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; int clear_errors = 1; @@ -845,6 +840,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) int sense_valid = 0; int sense_deferred = 0; + scsi_release_buffers(cmd); + if (result) { sense_valid = scsi_command_normalize_sense(cmd, &sshdr); if (sense_valid) @@ -867,17 +864,9 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) req->sense_len = len; } } - if (scsi_bidi_cmnd(cmd)) { - /* will also release_buffers */ - scsi_end_bidi_request(cmd); - return; - } - req->data_len = scsi_get_resid(cmd); + req->data_len = cmd->resid; } - BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */ - scsi_release_buffers(cmd); - /* * Next deal with any sectors which we were able to correctly * handle. @@ -885,6 +874,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " "%d bytes done.\n", req->nr_sectors, good_bytes)); + SCSI_LOG_HLCOMPLETE(1, printk("use_sg is %d\n", cmd->use_sg)); if (clear_errors) req->errors = 0; @@ -1001,80 +991,52 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_end_request(cmd, -EIO, this_count, !result); } -static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, - gfp_t gfp_mask) +/* + * Function: scsi_init_io() + * + * Purpose: SCSI I/O initialize function. + * + * Arguments: cmd - Command descriptor we wish to initialize + * + * Returns: 0 on success + * BLKPREP_DEFER if the failure is retryable + */ +static int scsi_init_io(struct scsi_cmnd *cmd) { - int count; + struct request *req = cmd->request; + int count; + + /* + * We used to not use scatter-gather for single segment request, + * but now we do (it makes highmem I/O easier to support without + * kmapping pages) + */ + cmd->use_sg = req->nr_phys_segments; /* * If sg table allocation fails, requeue request later. */ - if (unlikely(scsi_alloc_sgtable(sdb, req->nr_phys_segments, - gfp_mask))) { + if (unlikely(scsi_alloc_sgtable(cmd, GFP_ATOMIC))) { + scsi_unprep_request(req); return BLKPREP_DEFER; } req->buffer = NULL; if (blk_pc_request(req)) - sdb->length = req->data_len; + cmd->request_bufflen = req->data_len; else - sdb->length = req->nr_sectors << 9; + cmd->request_bufflen = req->nr_sectors << 9; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. */ - count = blk_rq_map_sg(req->q, req, sdb->table.sgl); - BUG_ON(count > sdb->table.nents); - sdb->table.nents = count; + count = blk_rq_map_sg(req->q, req, cmd->request_buffer); + BUG_ON(count > cmd->use_sg); + cmd->use_sg = count; return BLKPREP_OK; } -/* - * Function: scsi_init_io() - * - * Purpose: SCSI I/O initialize function. 
- * - * Arguments: cmd - Command descriptor we wish to initialize - * - * Returns: 0 on success - * BLKPREP_DEFER if the failure is retryable - * BLKPREP_KILL if the failure is fatal - */ -int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) -{ - int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); - if (error) - goto err_exit; - - if (blk_bidi_rq(cmd->request)) { - struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( - scsi_bidi_sdb_cache, GFP_ATOMIC); - if (!bidi_sdb) { - error = BLKPREP_DEFER; - goto err_exit; - } - - cmd->request->next_rq->special = bidi_sdb; - error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, - GFP_ATOMIC); - if (error) - goto err_exit; - } - - return BLKPREP_OK ; - -err_exit: - scsi_release_buffers(cmd); - if (error == BLKPREP_KILL) - scsi_put_command(cmd); - else /* BLKPREP_DEFER */ - scsi_unprep_request(cmd->request); - - return error; -} -EXPORT_SYMBOL(scsi_init_io); - static struct scsi_cmnd *scsi_get_cmd_from_req(struct scsi_device *sdev, struct request *req) { @@ -1119,14 +1081,16 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) BUG_ON(!req->nr_phys_segments); - ret = scsi_init_io(cmd, GFP_ATOMIC); + ret = scsi_init_io(cmd); if (unlikely(ret)) return ret; } else { BUG_ON(req->data_len); BUG_ON(req->data); - memset(&cmd->sdb, 0, sizeof(cmd->sdb)); + cmd->request_bufflen = 0; + cmd->request_buffer = NULL; + cmd->use_sg = 0; req->buffer = NULL; } @@ -1168,7 +1132,7 @@ int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) if (unlikely(!cmd)) return BLKPREP_DEFER; - return scsi_init_io(cmd, GFP_ATOMIC); + return scsi_init_io(cmd); } EXPORT_SYMBOL(scsi_setup_fs_cmnd); @@ -1578,7 +1542,20 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, * this limit is imposed by hardware restrictions */ blk_queue_max_hw_segments(q, shost->sg_tablesize); - blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); + + /* + * In the future, sg chaining support will be mandatory and this + * ifdef can then go away. Right now we don't have all archs + * converted, so better keep it safe. 
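The __scsi_alloc_queue() hunk above restores the per-host choice of physical-segment limit: the queue only gets the large SCSI_MAX_SG_CHAIN_SEGMENTS value when the architecture defines ARCH_HAS_SG_CHAIN and the driver sets use_sg_chaining; everyone else stays at SCSI_MAX_SG_SEGMENTS, so the scatterlist still fits in a single page. A compressed sketch of that decision; struct demo_host and demo_phys_segment_limit() are made-up stand-ins for struct Scsi_Host and blk_queue_max_phys_segments(), not the real API.

#include <stdbool.h>
#include <stdio.h>

#define SCSI_MAX_SG_SEGMENTS		128	/* one page worth of scatterlist */
#define SCSI_MAX_SG_CHAIN_SEGMENTS	2048	/* arbitrary; allows ~8 MB I/Os */

struct demo_host {
	bool use_sg_chaining;	/* stand-in for shost->use_sg_chaining */
};

/* Pick the limit the request queue would be configured with. */
static unsigned int demo_phys_segment_limit(const struct demo_host *shost,
					     bool arch_has_sg_chain)
{
	if (arch_has_sg_chain && shost->use_sg_chaining)
		return SCSI_MAX_SG_CHAIN_SEGMENTS;
	return SCSI_MAX_SG_SEGMENTS;
}

int main(void)
{
	struct demo_host chaining = { .use_sg_chaining = true };
	struct demo_host legacy = { .use_sg_chaining = false };

	printf("chaining-capable host: %u segments\n",
	       demo_phys_segment_limit(&chaining, true));
	printf("legacy host: %u segments\n",
	       demo_phys_segment_limit(&legacy, true));
	return 0;
}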
+ */ +#ifdef ARCH_HAS_SG_CHAIN + if (shost->use_sg_chaining) + blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS); + else + blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); +#else + blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS); +#endif blk_queue_max_sectors(q, shost->max_sectors); blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); @@ -1677,14 +1654,6 @@ int __init scsi_init_queue(void) return -ENOMEM; } - scsi_bidi_sdb_cache = kmem_cache_create("scsi_bidi_sdb", - sizeof(struct scsi_data_buffer), - 0, 0, NULL); - if (!scsi_bidi_sdb_cache) { - printk(KERN_ERR "SCSI: can't init scsi bidi sdb cache\n"); - goto cleanup_io_context; - } - for (i = 0; i < SG_MEMPOOL_NR; i++) { struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; int size = sgp->size * sizeof(struct scatterlist); @@ -1694,7 +1663,6 @@ int __init scsi_init_queue(void) if (!sgp->slab) { printk(KERN_ERR "SCSI: can't init sg slab %s\n", sgp->name); - goto cleanup_bidi_sdb; } sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE, @@ -1702,25 +1670,10 @@ int __init scsi_init_queue(void) if (!sgp->pool) { printk(KERN_ERR "SCSI: can't init sg mempool %s\n", sgp->name); - goto cleanup_bidi_sdb; } } return 0; - -cleanup_bidi_sdb: - for (i = 0; i < SG_MEMPOOL_NR; i++) { - struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; - if (sgp->pool) - mempool_destroy(sgp->pool); - if (sgp->slab) - kmem_cache_destroy(sgp->slab); - } - kmem_cache_destroy(scsi_bidi_sdb_cache); -cleanup_io_context: - kmem_cache_destroy(scsi_io_context_cache); - - return -ENOMEM; } void scsi_exit_queue(void) @@ -1728,7 +1681,6 @@ void scsi_exit_queue(void) int i; kmem_cache_destroy(scsi_io_context_cache); - kmem_cache_destroy(scsi_bidi_sdb_cache); for (i = 0; i < SG_MEMPOOL_NR; i++) { struct scsi_host_sg_pool *sgp = scsi_sg_pools + i; diff --git a/trunk/drivers/scsi/scsi_tgt_lib.c b/trunk/drivers/scsi/scsi_tgt_lib.c index 91630baea532..01e03f3f6ffa 100644 --- a/trunk/drivers/scsi/scsi_tgt_lib.c +++ b/trunk/drivers/scsi/scsi_tgt_lib.c @@ -331,7 +331,8 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd) scsi_tgt_uspace_send_status(cmd, tcmd->itn_id, tcmd->tag); - scsi_release_buffers(cmd); + if (scsi_sglist(cmd)) + scsi_free_sgtable(cmd); queue_work(scsi_tgtd, &tcmd->work); } @@ -352,6 +353,25 @@ static int scsi_tgt_transfer_response(struct scsi_cmnd *cmd) return 0; } +static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask) +{ + struct request *rq = cmd->request; + int count; + + cmd->use_sg = rq->nr_phys_segments; + if (scsi_alloc_sgtable(cmd, gfp_mask)) + return -ENOMEM; + + cmd->request_bufflen = rq->data_len; + + dprintk("cmd %p cnt %d %lu\n", cmd, scsi_sg_count(cmd), + rq_data_dir(rq)); + count = blk_rq_map_sg(rq->q, rq, scsi_sglist(cmd)); + BUG_ON(count > cmd->use_sg); + cmd->use_sg = count; + return 0; +} + /* TODO: test this crap and replace bio_map_user with new interface maybe */ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, unsigned long uaddr, unsigned int len, int rw) @@ -377,11 +397,9 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, } tcmd->bio = rq->bio; - err = scsi_init_io(cmd, GFP_KERNEL); - if (err) { - scsi_release_buffers(cmd); + err = scsi_tgt_init_cmd(cmd, GFP_KERNEL); + if (err) goto unmap_rq; - } return 0; diff --git a/trunk/drivers/scsi/sd.c b/trunk/drivers/scsi/sd.c index 51a5557f42dd..24eba3118b5a 100644 --- a/trunk/drivers/scsi/sd.c +++ b/trunk/drivers/scsi/sd.c @@ -519,7 +519,7 @@ static int sd_prep_fn(struct request_queue *q, struct 
request *rq) SCpnt->cmnd[4] = (unsigned char) this_count; SCpnt->cmnd[5] = 0; } - SCpnt->sdb.length = this_count * sdp->sector_size; + SCpnt->request_bufflen = this_count * sdp->sector_size; /* * We shouldn't disconnect in the middle of a sector, so with a dumb @@ -926,7 +926,7 @@ static struct block_device_operations sd_fops = { static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; - unsigned int xfer_size = scsi_bufflen(SCpnt); + unsigned int xfer_size = SCpnt->request_bufflen; unsigned int good_bytes = result ? 0 : xfer_size; u64 start_lba = SCpnt->request->sector; u64 bad_lba; diff --git a/trunk/drivers/scsi/sgiwd93.c b/trunk/drivers/scsi/sgiwd93.c index 26cfc56c7091..d4ebe8c67ba9 100644 --- a/trunk/drivers/scsi/sgiwd93.c +++ b/trunk/drivers/scsi/sgiwd93.c @@ -33,9 +33,10 @@ struct ip22_hostdata { struct WD33C93_hostdata wh; - dma_addr_t dma; - void *cpu; - struct device *dev; + struct hpc_data { + dma_addr_t dma; + void *cpu; + } hd; }; #define host_to_hostdata(host) ((struct ip22_hostdata *)((host)->hostdata)) @@ -45,11 +46,6 @@ struct hpc_chunk { u32 _padding; /* align to quadword boundary */ }; -/* space for hpc dma descriptors */ -#define HPC_DMA_SIZE PAGE_SIZE - -#define DMA_DIR(d) ((d == DATA_OUT_DIR) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) - static irqreturn_t sgiwd93_intr(int irq, void *dev_id) { struct Scsi_Host * host = dev_id; @@ -63,17 +59,15 @@ static irqreturn_t sgiwd93_intr(int irq, void *dev_id) } static inline -void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din) +void fill_hpc_entries(struct hpc_chunk *hcp, struct scsi_cmnd *cmd, int datainp) { unsigned long len = cmd->SCp.this_residual; void *addr = cmd->SCp.ptr; dma_addr_t physaddr; unsigned long count; - struct hpc_chunk *hcp; - physaddr = dma_map_single(hd->dev, addr, len, DMA_DIR(din)); + physaddr = dma_map_single(NULL, addr, len, cmd->sc_data_direction); cmd->SCp.dma_handle = physaddr; - hcp = hd->cpu; while (len) { /* @@ -95,9 +89,6 @@ void fill_hpc_entries(struct ip22_hostdata *hd, struct scsi_cmnd *cmd, int din) */ hcp->desc.pbuf = 0; hcp->desc.cntinfo = HPCDMA_EOX; - dma_cache_sync(hd->dev, hd->cpu, - (unsigned long)(hcp + 1) - (unsigned long)hd->cpu, - DMA_TO_DEVICE); } static int dma_setup(struct scsi_cmnd *cmd, int datainp) @@ -105,8 +96,9 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) struct ip22_hostdata *hdata = host_to_hostdata(cmd->device->host); struct hpc3_scsiregs *hregs = (struct hpc3_scsiregs *) cmd->device->host->base; + struct hpc_chunk *hcp = (struct hpc_chunk *) hdata->hd.cpu; - pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hdata->cpu); + pr_debug("dma_setup: datainp<%d> hcp<%p> ", datainp, hcp); hdata->wh.dma_dir = datainp; @@ -119,12 +111,12 @@ static int dma_setup(struct scsi_cmnd *cmd, int datainp) if (cmd->SCp.ptr == NULL || cmd->SCp.this_residual == 0) return 1; - fill_hpc_entries(hdata, cmd, datainp); + fill_hpc_entries(hcp, cmd, datainp); pr_debug(" HPCGO\n"); /* Start up the HPC. 
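fill_hpc_entries() above walks one DMA-mapped buffer and emits a hardware descriptor per chunk, then terminates the chain with an HPCDMA_EOX entry so the HPC stops there. A standalone illustration of that pattern; struct demo_desc and the 4096-byte CHUNK limit are assumptions made for this sketch, not the real struct hpc_chunk layout or the HPC's actual per-descriptor limit.

#include <stdint.h>
#include <stdio.h>

#define CHUNK 4096	/* assumed per-descriptor byte limit, for illustration */

struct demo_desc {
	uint32_t pbuf;		/* bus address of this chunk */
	uint32_t cntinfo;	/* byte count and control bits */
};

/* Chop one mapped buffer into descriptors; returns how many were used. */
static int demo_fill_entries(struct demo_desc *d, uint32_t physaddr,
			     unsigned long len)
{
	int n = 0;

	while (len) {
		unsigned long count = len > CHUNK ? CHUNK : len;

		d[n].pbuf = physaddr;
		d[n].cntinfo = (uint32_t)count;
		physaddr += count;
		len -= count;
		n++;
	}
	d[n].pbuf = 0;		/* terminator, standing in for HPCDMA_EOX */
	d[n].cntinfo = 0;
	return n;
}

int main(void)
{
	struct demo_desc d[8];
	int used = demo_fill_entries(d, 0x10000000u, 10000);

	printf("%d descriptors for a 10000-byte transfer\n", used);
	return 0;
}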
*/ - hregs->ndptr = hdata->dma; + hregs->ndptr = hdata->hd.dma; if (datainp) hregs->ctrl = HPC3_SCTRL_ACTIVE; else @@ -142,9 +134,6 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, if (!SCpnt) return; - if (SCpnt->SCp.ptr == NULL || SCpnt->SCp.this_residual == 0) - return; - hregs = (struct hpc3_scsiregs *) SCpnt->device->host->base; pr_debug("dma_stop: status<%d> ", status); @@ -156,9 +145,8 @@ static void dma_stop(struct Scsi_Host *instance, struct scsi_cmnd *SCpnt, barrier(); } hregs->ctrl = 0; - dma_unmap_single(hdata->dev, SCpnt->SCp.dma_handle, - SCpnt->SCp.this_residual, - DMA_DIR(hdata->wh.dma_dir)); + dma_unmap_single(NULL, SCpnt->SCp.dma_handle, SCpnt->SCp.this_residual, + SCpnt->sc_data_direction); pr_debug("\n"); } @@ -173,23 +161,22 @@ void sgiwd93_reset(unsigned long base) } EXPORT_SYMBOL_GPL(sgiwd93_reset); -static inline void init_hpc_chain(struct ip22_hostdata *hdata) +static inline void init_hpc_chain(struct hpc_data *hd) { - struct hpc_chunk *hcp = (struct hpc_chunk *)hdata->cpu; - dma_addr_t dma = hdata->dma; + struct hpc_chunk *hcp = (struct hpc_chunk *) hd->cpu; + struct hpc_chunk *dma = (struct hpc_chunk *) hd->dma; unsigned long start, end; start = (unsigned long) hcp; - end = start + HPC_DMA_SIZE; + end = start + PAGE_SIZE; while (start < end) { - hcp->desc.pnext = (u32) (dma + sizeof(struct hpc_chunk)); + hcp->desc.pnext = (u32) (dma + 1); hcp->desc.cntinfo = HPCDMA_EOX; - hcp++; - dma += sizeof(struct hpc_chunk); + hcp++; dma++; start += sizeof(struct hpc_chunk); }; hcp--; - hcp->desc.pnext = hdata->dma; + hcp->desc.pnext = hd->dma; } static int sgiwd93_bus_reset(struct scsi_cmnd *cmd) @@ -248,17 +235,16 @@ static int __init sgiwd93_probe(struct platform_device *pdev) host->irq = irq; hdata = host_to_hostdata(host); - hdata->dev = &pdev->dev; - hdata->cpu = dma_alloc_noncoherent(&pdev->dev, HPC_DMA_SIZE, - &hdata->dma, GFP_KERNEL); - if (!hdata->cpu) { + hdata->hd.cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, + &hdata->hd.dma, GFP_KERNEL); + if (!hdata->hd.cpu) { printk(KERN_WARNING "sgiwd93: Could not allocate memory for " "host %d buffer.\n", unit); err = -ENOMEM; goto out_put; } - init_hpc_chain(hdata); + init_hpc_chain(&hdata->hd); regs.SASR = wdregs + 3; regs.SCMD = wdregs + 7; @@ -288,7 +274,7 @@ static int __init sgiwd93_probe(struct platform_device *pdev) out_irq: free_irq(irq, host); out_free: - dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); + dma_free_coherent(NULL, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); out_put: scsi_host_put(host); out: @@ -304,7 +290,7 @@ static void __exit sgiwd93_remove(struct platform_device *pdev) scsi_remove_host(host); free_irq(pd->irq, host); - dma_free_noncoherent(&pdev->dev, HPC_DMA_SIZE, hdata->cpu, hdata->dma); + dma_free_coherent(&pdev->dev, PAGE_SIZE, hdata->hd.cpu, hdata->hd.dma); scsi_host_put(host); } diff --git a/trunk/drivers/scsi/sr.c b/trunk/drivers/scsi/sr.c index 50ba49250203..1fcee16fa36d 100644 --- a/trunk/drivers/scsi/sr.c +++ b/trunk/drivers/scsi/sr.c @@ -231,7 +231,7 @@ static int sr_media_change(struct cdrom_device_info *cdi, int slot) static int sr_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; - int this_count = scsi_bufflen(SCpnt); + int this_count = SCpnt->request_bufflen; int good_bytes = (result == 0 ? 
this_count : 0); int block_sectors = 0; long error_sector; @@ -379,18 +379,17 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) } { - struct scatterlist *sg; - int i, size = 0, sg_count = scsi_sg_count(SCpnt); + struct scatterlist *sg = SCpnt->request_buffer; + int i, size = 0; + for (i = 0; i < SCpnt->use_sg; i++) + size += sg[i].length; - scsi_for_each_sg(SCpnt, sg, sg_count, i) - size += sg->length; - - if (size != scsi_bufflen(SCpnt)) { + if (size != SCpnt->request_bufflen && SCpnt->use_sg) { scmd_printk(KERN_ERR, SCpnt, "mismatch count %d, bytes %d\n", - size, scsi_bufflen(SCpnt)); - if (scsi_bufflen(SCpnt) > size) - SCpnt->sdb.length = size; + size, SCpnt->request_bufflen); + if (SCpnt->request_bufflen > size) + SCpnt->request_bufflen = size; } } @@ -398,12 +397,12 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) * request doesn't start on hw block boundary, add scatter pads */ if (((unsigned int)rq->sector % (s_size >> 9)) || - (scsi_bufflen(SCpnt) % s_size)) { + (SCpnt->request_bufflen % s_size)) { scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); goto out; } - this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9); + this_count = (SCpnt->request_bufflen >> 9) / (s_size >> 9); SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", @@ -417,7 +416,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) if (this_count > 0xffff) { this_count = 0xffff; - SCpnt->sdb.length = this_count * s_size; + SCpnt->request_bufflen = this_count * s_size; } SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; diff --git a/trunk/drivers/scsi/stex.c b/trunk/drivers/scsi/stex.c index 72f6d8015358..e3fab3a6aed7 100644 --- a/trunk/drivers/scsi/stex.c +++ b/trunk/drivers/scsi/stex.c @@ -1123,6 +1123,7 @@ static struct scsi_host_template driver_template = { .this_id = -1, .sg_tablesize = ST_MAX_SG, .cmd_per_lun = ST_CMD_PER_LUN, + .use_sg_chaining = ENABLE_SG_CHAINING, }; static int stex_set_dma_mask(struct pci_dev * pdev) diff --git a/trunk/drivers/scsi/sym53c416.c b/trunk/drivers/scsi/sym53c416.c index 6325901e5093..1f6fd1680335 100644 --- a/trunk/drivers/scsi/sym53c416.c +++ b/trunk/drivers/scsi/sym53c416.c @@ -840,5 +840,6 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = 1, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c b/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c index d39107b7669b..21e926dcdab0 100644 --- a/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c +++ b/trunk/drivers/scsi/sym53c8xx_2/sym_glue.c @@ -207,7 +207,7 @@ void sym_set_cam_result_error(struct sym_hcb *np, struct sym_ccb *cp, int resid) /* * Bounce back the sense data to user. 
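The sr_prep_fn() hunk above re-adds the open-coded walk over SCpnt->request_buffer: sum the scatterlist segment lengths and, if the declared transfer length is larger than the table actually covers, clamp it to the real size. The same check in isolation; demo_sg and demo_clamp_bufflen() are illustrative names, not the kernel API.

#include <stdio.h>

struct demo_sg {
	unsigned int length;	/* bytes covered by this segment */
};

/* Return the transfer length, shrunk to what the segments actually cover. */
static unsigned int demo_clamp_bufflen(const struct demo_sg *sg, int nents,
					unsigned int bufflen)
{
	unsigned int size = 0;

	for (int i = 0; i < nents; i++)
		size += sg[i].length;

	if (nents && bufflen > size)
		bufflen = size;
	return bufflen;
}

int main(void)
{
	struct demo_sg sg[2] = { { 4096 }, { 2048 } };

	printf("clamped to %u bytes\n", demo_clamp_bufflen(sg, 2, 8192));
	return 0;
}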
*/ - memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); + memset(&cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); memcpy(cmd->sense_buffer, cp->sns_bbuf, min(SCSI_SENSE_BUFFERSIZE, SYM_SNS_BBUF_LEN)); #if 0 @@ -1681,6 +1681,7 @@ static struct scsi_host_template sym2_template = { .eh_host_reset_handler = sym53c8xx_eh_host_reset_handler, .this_id = 7, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, .max_sectors = 0xFFFF, #ifdef SYM_LINUX_PROC_INFO_SUPPORT .proc_info = sym53c8xx_proc_info, diff --git a/trunk/drivers/scsi/u14-34f.c b/trunk/drivers/scsi/u14-34f.c index 662c00451be4..4bc5407f9695 100644 --- a/trunk/drivers/scsi/u14-34f.c +++ b/trunk/drivers/scsi/u14-34f.c @@ -451,6 +451,7 @@ static struct scsi_host_template driver_template = { .this_id = 7, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #if !defined(__BIG_ENDIAN_BITFIELD) && !defined(__LITTLE_ENDIAN_BITFIELD) diff --git a/trunk/drivers/scsi/ultrastor.c b/trunk/drivers/scsi/ultrastor.c index f385dce8dfbe..75eca6b22db5 100644 --- a/trunk/drivers/scsi/ultrastor.c +++ b/trunk/drivers/scsi/ultrastor.c @@ -1204,5 +1204,6 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = ULTRASTOR_MAX_CMDS_PER_LUN, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/scsi/wd7000.c b/trunk/drivers/scsi/wd7000.c index c975c01b3a02..b4304ae78527 100644 --- a/trunk/drivers/scsi/wd7000.c +++ b/trunk/drivers/scsi/wd7000.c @@ -1671,6 +1671,7 @@ static struct scsi_host_template driver_template = { .cmd_per_lun = 1, .unchecked_isa_dma = 1, .use_clustering = ENABLE_CLUSTERING, + .use_sg_chaining = ENABLE_SG_CHAINING, }; #include "scsi_module.c" diff --git a/trunk/drivers/usb/storage/isd200.c b/trunk/drivers/usb/storage/isd200.c index 0db488624ab1..178e8c2a8a2f 100644 --- a/trunk/drivers/usb/storage/isd200.c +++ b/trunk/drivers/usb/storage/isd200.c @@ -415,14 +415,14 @@ static void isd200_set_srb(struct isd200_info *info, sg_init_one(&info->sg, buff, bufflen); srb->sc_data_direction = dir; - srb->sdb.table.sgl = buff ? &info->sg : NULL; - srb->sdb.length = bufflen; - srb->sdb.table.nents = buff ? 1 : 0; + srb->request_buffer = buff ? &info->sg : NULL; + srb->request_bufflen = bufflen; + srb->use_sg = buff ? 1 : 0; } static void isd200_srb_set_bufflen(struct scsi_cmnd *srb, unsigned bufflen) { - srb->sdb.length = bufflen; + srb->request_bufflen = bufflen; } diff --git a/trunk/drivers/watchdog/Kconfig b/trunk/drivers/watchdog/Kconfig index afcdc69e37d6..899fc13d0612 100644 --- a/trunk/drivers/watchdog/Kconfig +++ b/trunk/drivers/watchdog/Kconfig @@ -609,7 +609,7 @@ config SBC_EPX_C3_WATCHDOG config INDYDOG tristate "Indy/I2 Hardware Watchdog" - depends on SGI_HAS_INDYDOG + depends on SGI_IP22 help Hardware driver for the Indy's/I2's watchdog. 
This is a watchdog timer that will reboot the machine after a 60 second diff --git a/trunk/fs/dlm/dir.c b/trunk/fs/dlm/dir.c index ff97ba924333..46754553fdcc 100644 --- a/trunk/fs/dlm/dir.c +++ b/trunk/fs/dlm/dir.c @@ -49,7 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) spin_unlock(&ls->ls_recover_list_lock); if (!found) - de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL); + de = allocate_direntry(ls, len); return de; } @@ -62,7 +62,7 @@ void dlm_clear_free_entries(struct dlm_ls *ls) de = list_entry(ls->ls_recover_list.next, struct dlm_direntry, list); list_del(&de->list); - kfree(de); + free_direntry(de); } spin_unlock(&ls->ls_recover_list_lock); } @@ -171,7 +171,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen } list_del(&de->list); - kfree(de); + free_direntry(de); out: write_unlock(&ls->ls_dirtbl[bucket].lock); } @@ -302,7 +302,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, write_unlock(&ls->ls_dirtbl[bucket].lock); - de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL); + de = allocate_direntry(ls, namelen); if (!de) return -ENOMEM; @@ -313,7 +313,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name, write_lock(&ls->ls_dirtbl[bucket].lock); tmp = search_bucket(ls, name, namelen, bucket); if (tmp) { - kfree(de); + free_direntry(de); de = tmp; } else { list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); @@ -329,48 +329,50 @@ int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, return get_entry(ls, nodeid, name, namelen, r_nodeid); } -static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) -{ - struct dlm_rsb *r; - - down_read(&ls->ls_root_sem); - list_for_each_entry(r, &ls->ls_root_list, res_root_list) { - if (len == r->res_length && !memcmp(name, r->res_name, len)) { - up_read(&ls->ls_root_sem); - return r; - } - } - up_read(&ls->ls_root_sem); - return NULL; -} - -/* Find the rsb where we left off (or start again), then send rsb names - for rsb's we're master of and whose directory node matches the requesting - node. inbuf is the rsb name last sent, inlen is the name's length */ +/* Copy the names of master rsb's into the buffer provided. + Only select names whose dir node is the given nodeid. */ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, char *outbuf, int outlen, int nodeid) { struct list_head *list; - struct dlm_rsb *r; - int offset = 0, dir_nodeid; + struct dlm_rsb *start_r = NULL, *r = NULL; + int offset = 0, start_namelen, error, dir_nodeid; + char *start_name; uint16_t be_namelen; - down_read(&ls->ls_root_sem); + /* + * Find the rsb where we left off (or start again) + */ - if (inlen > 1) { - r = find_rsb_root(ls, inbuf, inlen); - if (!r) { - inbuf[inlen - 1] = '\0'; - log_error(ls, "copy_master_names from %d start %d %s", - nodeid, inlen, inbuf); - goto out; - } - list = r->res_root_list.next; - } else { - list = ls->ls_root_list.next; + start_namelen = inlen; + start_name = inbuf; + + if (start_namelen > 1) { + /* + * We could also use a find_rsb_root() function here that + * searched the ls_root_list. + */ + error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER, + &start_r); + DLM_ASSERT(!error && start_r, + printk("error %d\n", error);); + DLM_ASSERT(!list_empty(&start_r->res_root_list), + dlm_print_rsb(start_r);); + dlm_put_rsb(start_r); } + /* + * Send rsb names for rsb's we're master of and whose directory node + * matches the requesting node. 
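dlm_copy_master_names() fills the recovery reply with resource names, each preceded by a 16-bit big-endian length (the be_namelen above), and the requesting node resumes from the last name it received. A rough sketch of that kind of packing; pack_names() and the exact layout here are only a guess at the general shape, not the dlm wire format.

#include <arpa/inet.h>	/* htons */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pack as many length-prefixed names as fit; return how many were consumed
 * so the caller can resume from that point on the next request. */
static int pack_names(const char *const names[], int count,
		      char *out, size_t outlen)
{
	size_t offset = 0;
	int i;

	for (i = 0; i < count; i++) {
		size_t len = strlen(names[i]);
		uint16_t be_len = htons((uint16_t)len);

		if (offset + sizeof(be_len) + len > outlen)
			break;	/* out of room; resend starting from names[i] */

		memcpy(out + offset, &be_len, sizeof(be_len));
		offset += sizeof(be_len);
		memcpy(out + offset, names[i], len);
		offset += len;
	}
	return i;
}

int main(void)
{
	const char *names[] = { "resource-a", "resource-b", "resource-c" };
	char buf[32];

	printf("packed %d of 3 names into %zu bytes\n",
	       pack_names(names, 3, buf, sizeof(buf)), sizeof(buf));
	return 0;
}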
+ */ + + down_read(&ls->ls_root_sem); + if (start_r) + list = start_r->res_root_list.next; + else + list = ls->ls_root_list.next; + for (offset = 0; list != &ls->ls_root_list; list = list->next) { r = list_entry(list, struct dlm_rsb, res_root_list); if (r->res_nodeid) diff --git a/trunk/fs/dlm/dlm_internal.h b/trunk/fs/dlm/dlm_internal.h index ec61bbaf25df..d2fc2384c3be 100644 --- a/trunk/fs/dlm/dlm_internal.h +++ b/trunk/fs/dlm/dlm_internal.h @@ -570,21 +570,5 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0; } -int dlm_netlink_init(void); -void dlm_netlink_exit(void); -void dlm_timeout_warn(struct dlm_lkb *lkb); - -#ifdef CONFIG_DLM_DEBUG -int dlm_register_debugfs(void); -void dlm_unregister_debugfs(void); -int dlm_create_debug_file(struct dlm_ls *ls); -void dlm_delete_debug_file(struct dlm_ls *ls); -#else -static inline int dlm_register_debugfs(void) { return 0; } -static inline void dlm_unregister_debugfs(void) { } -static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } -static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } -#endif - #endif /* __DLM_INTERNAL_DOT_H__ */ diff --git a/trunk/fs/dlm/lock.c b/trunk/fs/dlm/lock.c index ff4a198fa677..3915b8e14146 100644 --- a/trunk/fs/dlm/lock.c +++ b/trunk/fs/dlm/lock.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -88,6 +88,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, static int receive_extralen(struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); static void del_timeout(struct dlm_lkb *lkb); +void dlm_timeout_warn(struct dlm_lkb *lkb); /* * Lock compatibilty matrix - thanks Steve @@ -334,7 +335,7 @@ static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len) { struct dlm_rsb *r; - r = dlm_allocate_rsb(ls, len); + r = allocate_rsb(ls, len); if (!r) return NULL; @@ -477,7 +478,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); if (!error) { write_unlock(&ls->ls_rsbtbl[bucket].lock); - dlm_free_rsb(r); + free_rsb(r); r = tmp; goto out; } @@ -489,6 +490,12 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen, return error; } +int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, + unsigned int flags, struct dlm_rsb **r_ret) +{ + return find_rsb(ls, name, namelen, flags, r_ret); +} + /* This is only called to add a reference when the code already holds a valid reference to the rsb, so there's no need for locking. 
*/ @@ -512,7 +519,7 @@ static void toss_rsb(struct kref *kref) list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; if (r->res_lvbptr) { - dlm_free_lvb(r->res_lvbptr); + free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; } } @@ -582,7 +589,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) uint32_t lkid = 0; uint16_t bucket; - lkb = dlm_allocate_lkb(ls); + lkb = allocate_lkb(ls); if (!lkb) return -ENOMEM; @@ -676,8 +683,8 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb) /* for local/process lkbs, lvbptr points to caller's lksb */ if (lkb->lkb_lvbptr && is_master_copy(lkb)) - dlm_free_lvb(lkb->lkb_lvbptr); - dlm_free_lkb(lkb); + free_lvb(lkb->lkb_lvbptr); + free_lkb(lkb); return 1; } else { write_unlock(&ls->ls_lkbtbl[bucket].lock); @@ -981,7 +988,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b) if (is_master(r)) dir_remove(r); - dlm_free_rsb(r); + free_rsb(r); count++; } else { write_unlock(&ls->ls_rsbtbl[b].lock); @@ -1164,7 +1171,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) return; if (!r->res_lvbptr) - r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + r->res_lvbptr = allocate_lvb(r->res_ls); if (!r->res_lvbptr) return; @@ -1196,7 +1203,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) return; if (!r->res_lvbptr) - r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + r->res_lvbptr = allocate_lvb(r->res_ls); if (!r->res_lvbptr) return; @@ -1845,7 +1852,7 @@ static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) { struct dlm_ls *ls = r->res_ls; - int i, error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); + int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); @@ -1879,7 +1886,7 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) return 1; } - for (i = 0; i < 2; i++) { + for (;;) { /* It's possible for dlm_scand to remove an old rsb for this same resource from the toss list, us to create a new one, look up the master locally, and find it @@ -1893,8 +1900,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) log_debug(ls, "dir_lookup error %d %s", error, r->res_name); schedule(); } - if (error && error != -EEXIST) - return error; if (ret_nodeid == our_nodeid) { r->res_first_lkid = 0; @@ -1936,11 +1941,8 @@ static void confirm_master(struct dlm_rsb *r, int error) break; case -EAGAIN: - case -EBADR: - case -ENOTBLK: - /* the remote request failed and won't be retried (it was - a NOQUEUE, or has been canceled/unlocked); make a waiting - lkb the first_lkid */ + /* the remote master didn't queue our NOQUEUE request; + make a waiting lkb the first_lkid */ r->res_first_lkid = 0; @@ -2106,18 +2108,17 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) /* an lkb may be waiting for an rsb lookup to complete where the lookup was initiated by another lock */ - if (!list_empty(&lkb->lkb_rsb_lookup)) { - if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) { + if (!list_empty(&lkb->lkb_rsb_lookup)) { log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id); list_del_init(&lkb->lkb_rsb_lookup); queue_cast(lkb->lkb_resource, lkb, args->flags & DLM_LKF_CANCEL ? 
-DLM_ECANCEL : -DLM_EUNLOCK); unhold_lkb(lkb); /* undoes create_lkb() */ + rv = -EBUSY; + goto out; } - /* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */ - rv = -EBUSY; - goto out; } /* cancel not allowed with another cancel/unlock in progress */ @@ -2985,7 +2986,7 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (lkb->lkb_exflags & DLM_LKF_VALBLK) { if (!lkb->lkb_lvbptr) - lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + lkb->lkb_lvbptr = allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); @@ -3005,9 +3006,11 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST); lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP); + DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb);); + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { /* lkb was just created so there won't be an lvb yet */ - lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + lkb->lkb_lvbptr = allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; } @@ -3018,6 +3021,16 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms) { + if (lkb->lkb_nodeid != ms->m_header.h_nodeid) { + log_error(ls, "convert_args nodeid %d %d lkid %x %x", + lkb->lkb_nodeid, ms->m_header.h_nodeid, + lkb->lkb_id, lkb->lkb_remid); + return -EINVAL; + } + + if (!is_master_copy(lkb)) + return -EINVAL; + if (lkb->lkb_status != DLM_LKSTS_GRANTED) return -EBUSY; @@ -3033,6 +3046,8 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_message *ms) { + if (!is_master_copy(lkb)) + return -EINVAL; if (receive_lvb(ls, lkb, ms)) return -ENOMEM; return 0; @@ -3048,50 +3063,6 @@ static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_remid = ms->m_lkid; } -/* This is called after the rsb is locked so that we can safely inspect - fields in the lkb. 
*/ - -static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) -{ - int from = ms->m_header.h_nodeid; - int error = 0; - - switch (ms->m_type) { - case DLM_MSG_CONVERT: - case DLM_MSG_UNLOCK: - case DLM_MSG_CANCEL: - if (!is_master_copy(lkb) || lkb->lkb_nodeid != from) - error = -EINVAL; - break; - - case DLM_MSG_CONVERT_REPLY: - case DLM_MSG_UNLOCK_REPLY: - case DLM_MSG_CANCEL_REPLY: - case DLM_MSG_GRANT: - case DLM_MSG_BAST: - if (!is_process_copy(lkb) || lkb->lkb_nodeid != from) - error = -EINVAL; - break; - - case DLM_MSG_REQUEST_REPLY: - if (!is_process_copy(lkb)) - error = -EINVAL; - else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from) - error = -EINVAL; - break; - - default: - error = -EINVAL; - } - - if (error) - log_error(lkb->lkb_resource->res_ls, - "ignore invalid message %d from %d %x %x %x %d", - ms->m_type, from, lkb->lkb_id, lkb->lkb_remid, - lkb->lkb_flags, lkb->lkb_nodeid); - return error; -} - static void receive_request(struct dlm_ls *ls, struct dlm_message *ms) { struct dlm_lkb *lkb; @@ -3153,21 +3124,17 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - receive_flags(lkb, ms); error = receive_convert_args(ls, lkb, ms); if (error) - goto out_reply; + goto out; reply = !down_conversion(lkb); error = do_convert(r, lkb); - out_reply: + out: if (reply) send_convert_reply(r, lkb, error); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3193,19 +3160,15 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - receive_flags(lkb, ms); error = receive_unlock_args(ls, lkb, ms); if (error) - goto out_reply; + goto out; error = do_unlock(r, lkb); - out_reply: - send_unlock_reply(r, lkb, error); out: + send_unlock_reply(r, lkb, error); + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3233,13 +3196,9 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - error = do_cancel(r, lkb); send_cancel_reply(r, lkb, error); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3258,26 +3217,22 @@ static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_grant from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_grant no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - receive_flags_reply(lkb, ms); if (is_altmode(lkb)) munge_altmode(lkb, ms); grant_lock_pc(r, lkb, ms); queue_cast(r, lkb, 0); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3291,22 +3246,18 @@ static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_bast from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_bast no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - queue_bast(r, lkb, ms->m_bastmode); - out: + unlock_rsb(r); put_rsb(r); dlm_put_lkb(lkb); @@ -3372,19 +3323,15 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) error 
= find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_request_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_request_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); r = lkb->lkb_resource; hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - mstype = lkb->lkb_wait_type; error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY); if (error) @@ -3436,7 +3383,6 @@ static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) if (is_overlap(lkb)) { /* we'll ignore error in cancel/unlock reply */ queue_cast_overlap(r, lkb); - confirm_master(r, result); unhold_lkb(lkb); /* undoes create_lkb() */ } else _request_lock(r, lkb); @@ -3517,10 +3463,6 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3539,10 +3481,10 @@ static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_convert_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_convert_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_convert_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3556,10 +3498,6 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3591,10 +3529,10 @@ static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_unlock_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_unlock_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_unlock_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3608,10 +3546,6 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) hold_rsb(r); lock_rsb(r); - error = validate_message(lkb, ms); - if (error) - goto out; - /* stub reply can happen with waiters_mutex held */ error = remove_from_waiters_ms(lkb, ms); if (error) @@ -3643,10 +3577,10 @@ static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) error = find_lkb(ls, ms->m_remid, &lkb); if (error) { - log_debug(ls, "receive_cancel_reply from %d no lkb %x", - ms->m_header.h_nodeid, ms->m_remid); + log_error(ls, "receive_cancel_reply no lkb"); return; } + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); _receive_cancel_reply(lkb, ms); dlm_put_lkb(lkb); @@ -3706,13 +3640,6 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms) { - if (!dlm_is_member(ls, ms->m_header.h_nodeid)) { - log_debug(ls, "ignore non-member message %d from %d %x %x %d", - ms->m_type, ms->m_header.h_nodeid, ms->m_lkid, - ms->m_remid, ms->m_result); - return; - } - switch (ms->m_type) { /* messages sent to a master node */ @@ -3851,9 +3778,8 @@ void dlm_receive_buffer(struct dlm_header *hd, int nodeid) ls = dlm_find_lockspace_global(hd->h_lockspace); if (!ls) { - if (dlm_config.ci_log_debug) - log_print("invalid 
lockspace %x from %d cmd %d type %d", - hd->h_lockspace, nodeid, hd->h_cmd, type); + log_print("invalid h_lockspace %x from %d cmd %d type %d", + hd->h_lockspace, nodeid, hd->h_cmd, type); if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) dlm_send_ls_not_ready(nodeid, rc); @@ -3880,7 +3806,6 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; ls->ls_stub_ms.m_result = -EINPROGRESS; ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_convert_reply(lkb, &ls->ls_stub_ms); /* Same special case as in receive_rcom_lock_args() */ @@ -3922,7 +3847,6 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) void dlm_recover_waiters_pre(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; - int wait_type, stub_unlock_result, stub_cancel_result; mutex_lock(&ls->ls_waiters_mutex); @@ -3941,33 +3865,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) if (!waiter_needs_recovery(ls, lkb)) continue; - wait_type = lkb->lkb_wait_type; - stub_unlock_result = -DLM_EUNLOCK; - stub_cancel_result = -DLM_ECANCEL; - - /* Main reply may have been received leaving a zero wait_type, - but a reply for the overlapping op may not have been - received. In that case we need to fake the appropriate - reply for the overlap op. */ - - if (!wait_type) { - if (is_overlap_cancel(lkb)) { - wait_type = DLM_MSG_CANCEL; - if (lkb->lkb_grmode == DLM_LOCK_IV) - stub_cancel_result = 0; - } - if (is_overlap_unlock(lkb)) { - wait_type = DLM_MSG_UNLOCK; - if (lkb->lkb_grmode == DLM_LOCK_IV) - stub_unlock_result = -ENOENT; - } - - log_debug(ls, "rwpre overlap %x %x %d %d %d", - lkb->lkb_id, lkb->lkb_flags, wait_type, - stub_cancel_result, stub_unlock_result); - } - - switch (wait_type) { + switch (lkb->lkb_wait_type) { case DLM_MSG_REQUEST: lkb->lkb_flags |= DLM_IFL_RESEND; @@ -3980,9 +3878,8 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_UNLOCK: hold_lkb(lkb); ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; - ls->ls_stub_ms.m_result = stub_unlock_result; + ls->ls_stub_ms.m_result = -DLM_EUNLOCK; ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_unlock_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); break; @@ -3990,16 +3887,15 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) case DLM_MSG_CANCEL: hold_lkb(lkb); ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; - ls->ls_stub_ms.m_result = stub_cancel_result; + ls->ls_stub_ms.m_result = -DLM_ECANCEL; ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; _receive_cancel_reply(lkb, &ls->ls_stub_ms); dlm_put_lkb(lkb); break; default: - log_error(ls, "invalid lkb wait_type %d %d", - lkb->lkb_wait_type, wait_type); + log_error(ls, "invalid lkb wait_type %d", + lkb->lkb_wait_type); } schedule(); } @@ -4288,7 +4184,7 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); if (lkb->lkb_exflags & DLM_LKF_VALBLK) { - lkb->lkb_lvbptr = dlm_allocate_lvb(ls); + lkb->lkb_lvbptr = allocate_lvb(ls); if (!lkb->lkb_lvbptr) return -ENOMEM; lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - @@ -4363,7 +4259,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) put_rsb(r); out: if (error) - log_debug(ls, "recover_master_copy %d %x", error, rl->rl_lkid); + log_print("recover_master_copy %d %x", error, rl->rl_lkid); rl->rl_result = error; return error; } @@ 
-4446,7 +4342,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } } - /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + /* After ua is attached to lkb it will be freed by free_lkb(). When DLM_IFL_USER is set, the dlm knows that this is a userspace lock and that lkb_astparam is the dlm_user_args structure. */ @@ -4783,7 +4679,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) } list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { - lkb->lkb_ast_type = 0; list_del(&lkb->lkb_astqueue); dlm_put_lkb(lkb); } diff --git a/trunk/fs/dlm/lock.h b/trunk/fs/dlm/lock.h index 27b6ed302911..ada04680a1e5 100644 --- a/trunk/fs/dlm/lock.h +++ b/trunk/fs/dlm/lock.h @@ -19,6 +19,8 @@ void dlm_print_lkb(struct dlm_lkb *lkb); void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); void dlm_receive_buffer(struct dlm_header *hd, int nodeid); int dlm_modes_compat(int mode1, int mode2); +int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, + unsigned int flags, struct dlm_rsb **r_ret); void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); int dlm_put_lkb(struct dlm_lkb *lkb); diff --git a/trunk/fs/dlm/lockspace.c b/trunk/fs/dlm/lockspace.c index b180fdc51085..5c108c49cb8c 100644 --- a/trunk/fs/dlm/lockspace.c +++ b/trunk/fs/dlm/lockspace.c @@ -24,6 +24,14 @@ #include "recover.h" #include "requestqueue.h" +#ifdef CONFIG_DLM_DEBUG +int dlm_create_debug_file(struct dlm_ls *ls); +void dlm_delete_debug_file(struct dlm_ls *ls); +#else +static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } +static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +#endif + static int ls_count; static struct mutex ls_lock; static struct list_head lslist; @@ -676,9 +684,9 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_del_ast(lkb); if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) - dlm_free_lvb(lkb->lkb_lvbptr); + free_lvb(lkb->lkb_lvbptr); - dlm_free_lkb(lkb); + free_lkb(lkb); } } dlm_astd_resume(); @@ -696,7 +704,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) res_hashchain); list_del(&rsb->res_hashchain); - dlm_free_rsb(rsb); + free_rsb(rsb); } head = &ls->ls_rsbtbl[i].toss; @@ -704,7 +712,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) rsb = list_entry(head->next, struct dlm_rsb, res_hashchain); list_del(&rsb->res_hashchain); - dlm_free_rsb(rsb); + free_rsb(rsb); } } diff --git a/trunk/fs/dlm/lowcomms.c b/trunk/fs/dlm/lowcomms.c index 7c1e5e5cccd8..e9923ca9c2d9 100644 --- a/trunk/fs/dlm/lowcomms.c +++ b/trunk/fs/dlm/lowcomms.c @@ -864,7 +864,7 @@ static void sctp_init_assoc(struct connection *con) static void tcp_connect_to_sock(struct connection *con) { int result = -EHOSTUNREACH; - struct sockaddr_storage saddr, src_addr; + struct sockaddr_storage saddr; int addr_len; struct socket *sock; @@ -898,17 +898,6 @@ static void tcp_connect_to_sock(struct connection *con) con->connect_action = tcp_connect_to_sock; add_sock(sock, con); - /* Bind to our cluster-known address connecting to avoid - routing problems */ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); - make_sockaddr(&src_addr, 0, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, - addr_len); - if (result < 0) { - log_print("could not bind for connect: %d", result); - /* This *may* not indicate a critical error */ - } - make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); log_print("connecting to %d", con->nodeid); @@ -1437,8 +1426,6 
@@ void dlm_lowcomms_stop(void) con = __nodeid2con(i, 0); if (con) { close_connection(con, true); - if (con->othercon) - kmem_cache_free(con_cache, con->othercon); kmem_cache_free(con_cache, con); } } diff --git a/trunk/fs/dlm/main.c b/trunk/fs/dlm/main.c index 58487fb95a4c..eca2907f2386 100644 --- a/trunk/fs/dlm/main.c +++ b/trunk/fs/dlm/main.c @@ -18,6 +18,16 @@ #include "memory.h" #include "config.h" +#ifdef CONFIG_DLM_DEBUG +int dlm_register_debugfs(void); +void dlm_unregister_debugfs(void); +#else +static inline int dlm_register_debugfs(void) { return 0; } +static inline void dlm_unregister_debugfs(void) { } +#endif +int dlm_netlink_init(void); +void dlm_netlink_exit(void); + static int __init init_dlm(void) { int error; diff --git a/trunk/fs/dlm/member.c b/trunk/fs/dlm/member.c index fa17f5a27883..e9cdcab306e2 100644 --- a/trunk/fs/dlm/member.c +++ b/trunk/fs/dlm/member.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -70,7 +70,7 @@ static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) ls->ls_num_nodes--; } -int dlm_is_member(struct dlm_ls *ls, int nodeid) +static int dlm_is_member(struct dlm_ls *ls, int nodeid) { struct dlm_member *memb; diff --git a/trunk/fs/dlm/member.h b/trunk/fs/dlm/member.h index 7a26fca1e0b5..927c08c19214 100644 --- a/trunk/fs/dlm/member.h +++ b/trunk/fs/dlm/member.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -19,7 +19,6 @@ void dlm_clear_members(struct dlm_ls *ls); void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); -int dlm_is_member(struct dlm_ls *ls, int nodeid); #endif /* __MEMBER_DOT_H__ */ diff --git a/trunk/fs/dlm/memory.c b/trunk/fs/dlm/memory.c index f7783867491a..ecf0e5cb2035 100644 --- a/trunk/fs/dlm/memory.c +++ b/trunk/fs/dlm/memory.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -35,7 +35,7 @@ void dlm_memory_exit(void) kmem_cache_destroy(lkb_cache); } -char *dlm_allocate_lvb(struct dlm_ls *ls) +char *allocate_lvb(struct dlm_ls *ls) { char *p; @@ -43,7 +43,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls) return p; } -void dlm_free_lvb(char *p) +void free_lvb(char *p) { kfree(p); } @@ -51,7 +51,7 @@ void dlm_free_lvb(char *p) /* FIXME: have some minimal space built-in to rsb for the name and kmalloc a separate name if needed, like dentries are done */ -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) { struct dlm_rsb *r; @@ -61,14 +61,14 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) return r; } -void dlm_free_rsb(struct dlm_rsb *r) +void free_rsb(struct dlm_rsb *r) { if (r->res_lvbptr) - dlm_free_lvb(r->res_lvbptr); + free_lvb(r->res_lvbptr); kfree(r); } -struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; @@ -76,7 +76,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) return lkb; } -void dlm_free_lkb(struct dlm_lkb *lkb) +void free_lkb(struct dlm_lkb *lkb) { if (lkb->lkb_flags & DLM_IFL_USER) { struct dlm_user_args *ua; @@ -90,3 +90,19 @@ void dlm_free_lkb(struct dlm_lkb *lkb) kmem_cache_free(lkb_cache, lkb); } +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen) +{ + struct dlm_direntry *de; + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN, + printk("namelen = %d\n", namelen);); + + de = kzalloc(sizeof(*de) + namelen, GFP_KERNEL); + return de; +} + +void free_direntry(struct dlm_direntry *de) +{ + kfree(de); +} + diff --git a/trunk/fs/dlm/memory.h b/trunk/fs/dlm/memory.h index 485fb29143bd..6ead158ccc5c 100644 --- a/trunk/fs/dlm/memory.h +++ b/trunk/fs/dlm/memory.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -16,12 +16,14 @@ int dlm_memory_init(void); void dlm_memory_exit(void); -struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen); -void dlm_free_rsb(struct dlm_rsb *r); -struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls); -void dlm_free_lkb(struct dlm_lkb *l); -char *dlm_allocate_lvb(struct dlm_ls *ls); -void dlm_free_lvb(char *l); +struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); +void free_rsb(struct dlm_rsb *r); +struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); +void free_lkb(struct dlm_lkb *l); +struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); +void free_direntry(struct dlm_direntry *de); +char *allocate_lvb(struct dlm_ls *ls); +void free_lvb(char *l); #endif /* __MEMORY_DOT_H__ */ diff --git a/trunk/fs/dlm/midcomms.c b/trunk/fs/dlm/midcomms.c index e69926e984db..f8c69dda16a0 100644 --- a/trunk/fs/dlm/midcomms.c +++ b/trunk/fs/dlm/midcomms.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
+** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -58,12 +58,8 @@ static void copy_from_cb(void *dst, const void *base, unsigned offset, int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, unsigned len, unsigned limit) { - union { - unsigned char __buf[DLM_INBUF_LEN]; - /* this is to force proper alignment on some arches */ - struct dlm_header dlm; - } __tmp; - struct dlm_header *msg = &__tmp.dlm; + unsigned char __tmp[DLM_INBUF_LEN]; + struct dlm_header *msg = (struct dlm_header *) __tmp; int ret = 0; int err = 0; uint16_t msglen; @@ -104,7 +100,8 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, in the buffer on the stack (which should work for most ordinary messages). */ - if (msglen > DLM_INBUF_LEN && msg == &__tmp.dlm) { + if (msglen > sizeof(__tmp) && + msg == (struct dlm_header *) __tmp) { msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); if (msg == NULL) return ret; @@ -122,7 +119,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, dlm_receive_buffer(msg, nodeid); } - if (msg != &__tmp.dlm) + if (msg != (struct dlm_header *) __tmp) kfree(msg); return err ? err : ret; diff --git a/trunk/fs/dlm/rcom.c b/trunk/fs/dlm/rcom.c index 026824cd3acb..ae2fd97fa4ad 100644 --- a/trunk/fs/dlm/rcom.c +++ b/trunk/fs/dlm/rcom.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -197,6 +197,11 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_rcom_spin); } +static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + receive_sync_reply(ls, rc_in); +} + int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) { struct dlm_rcom *rc; @@ -249,6 +254,11 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(ls, mh, rc); } +static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + receive_sync_reply(ls, rc_in); +} + int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) { struct dlm_rcom *rc; @@ -371,6 +381,11 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(ls, mh, rc); } +static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +{ + dlm_recover_process_copy(ls, rc_in); +} + /* If the lockspace doesn't exist then still send a status message back; it's possible that it just doesn't have its global_id yet. 
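dlm_process_incoming_buffer(), a few hunks up, keeps a small scratch buffer on the stack for ordinary messages and only falls back to kmalloc() when a message exceeds DLM_INBUF_LEN. The pattern in isolation; INBUF_LEN and handle_message() are stand-ins for illustration, not the dlm code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define INBUF_LEN 64	/* stand-in for DLM_INBUF_LEN */

/* Copy one variable-length message, using the stack for the common small
 * case and the heap only for oversized messages. */
static void handle_message(const char *wire, size_t msglen)
{
	char stackbuf[INBUF_LEN];
	char *msg = stackbuf;

	if (msglen > sizeof(stackbuf)) {
		msg = malloc(msglen);
		if (!msg)
			return;
	}

	memcpy(msg, wire, msglen);
	printf("processed %zu-byte message (%s buffer)\n",
	       msglen, msg == stackbuf ? "stack" : "heap");

	if (msg != stackbuf)
		free(msg);
}

int main(void)
{
	char small[16] = "ping";
	char big[256] = { 0 };

	handle_message(small, sizeof(small));
	handle_message(big, sizeof(big));
	return 0;
}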
*/ @@ -466,11 +481,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_STATUS_REPLY: - receive_sync_reply(ls, rc); + receive_rcom_status_reply(ls, rc); break; case DLM_RCOM_NAMES_REPLY: - receive_sync_reply(ls, rc); + receive_rcom_names_reply(ls, rc); break; case DLM_RCOM_LOOKUP_REPLY: @@ -478,11 +493,11 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) break; case DLM_RCOM_LOCK_REPLY: - dlm_recover_process_copy(ls, rc); + receive_rcom_lock_reply(ls, rc); break; default: - log_error(ls, "receive_rcom bad type %d", rc->rc_type); + DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); } out: return; diff --git a/trunk/fs/dlm/recover.c b/trunk/fs/dlm/recover.c index df075dc300fa..c2cc7694cd16 100644 --- a/trunk/fs/dlm/recover.c +++ b/trunk/fs/dlm/recover.c @@ -629,7 +629,7 @@ static void recover_lvb(struct dlm_rsb *r) goto out; if (!r->res_lvbptr) { - r->res_lvbptr = dlm_allocate_lvb(r->res_ls); + r->res_lvbptr = allocate_lvb(r->res_ls); if (!r->res_lvbptr) goto out; } @@ -731,20 +731,6 @@ int dlm_create_root_list(struct dlm_ls *ls) list_add(&r->res_root_list, &ls->ls_root_list); dlm_hold_rsb(r); } - - /* If we're using a directory, add tossed rsbs to the root - list; they'll have entries created in the new directory, - but no other recovery steps should do anything with them. */ - - if (dlm_no_directory(ls)) { - read_unlock(&ls->ls_rsbtbl[i].lock); - continue; - } - - list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { - list_add(&r->res_root_list, &ls->ls_root_list); - dlm_hold_rsb(r); - } read_unlock(&ls->ls_rsbtbl[i].lock); } out: @@ -764,11 +750,6 @@ void dlm_release_root_list(struct dlm_ls *ls) up_write(&ls->ls_root_sem); } -/* If not using a directory, clear the entire toss list, there's no benefit to - caching the master value since it's fixed. If we are using a dir, keep the - rsb's we're the master of. Recovery will add them to the root list and from - there they'll be entered in the rebuilt directory. */ - void dlm_clear_toss_list(struct dlm_ls *ls) { struct dlm_rsb *r, *safe; @@ -778,10 +759,8 @@ void dlm_clear_toss_list(struct dlm_ls *ls) write_lock(&ls->ls_rsbtbl[i].lock); list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, res_hashchain) { - if (dlm_no_directory(ls) || !is_master(r)) { - list_del(&r->res_hashchain); - dlm_free_rsb(r); - } + list_del(&r->res_hashchain); + free_rsb(r); } write_unlock(&ls->ls_rsbtbl[i].lock); } diff --git a/trunk/fs/dlm/recoverd.c b/trunk/fs/dlm/recoverd.c index 997f9531d594..4b89e20eebe7 100644 --- a/trunk/fs/dlm/recoverd.c +++ b/trunk/fs/dlm/recoverd.c @@ -67,18 +67,17 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_astd_resume(); /* - * Free non-master tossed rsb's. Master rsb's are kept on toss - * list and put on root list to be included in resdir recovery. + * This list of root rsb's will be the basis of most of the recovery + * routines. */ - dlm_clear_toss_list(ls); + dlm_create_root_list(ls); /* - * This list of root rsb's will be the basis of most of the recovery - * routines. + * Free all the tossed rsb's so we don't have to recover them. */ - dlm_create_root_list(ls); + dlm_clear_toss_list(ls); /* * Add or remove nodes from the lockspace's ls_nodes list. 
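As background for the fs/dlm/midcomms.c hunk above: its '-' lines drop a union-based stack buffer in favour of a plain char array cast to struct dlm_header. A minimal, self-contained user-space sketch of that union idiom follows; DLM_INBUF_LEN and the reduced struct dlm_header here are illustrative stand-ins, not the kernel definitions.

    #include <stdio.h>
    #include <string.h>

    #define DLM_INBUF_LEN 128                /* illustrative size, not the kernel value */

    struct dlm_header {                      /* reduced stand-in for the real header */
            unsigned int   h_version;
            unsigned int   h_nodeid;
            unsigned short h_length;
            unsigned char  h_cmd;
            unsigned char  h_pad;
    };

    int main(void)
    {
            /* The union guarantees the byte buffer is aligned at least as strictly
             * as struct dlm_header, so treating it as a header is safe even on
             * architectures with strict alignment rules. */
            union {
                    unsigned char __buf[DLM_INBUF_LEN];
                    struct dlm_header dlm;
            } __tmp;

            struct dlm_header *msg = &__tmp.dlm;

            memset(msg, 0, sizeof(*msg));
            msg->h_length = sizeof(*msg);
            printf("buffer %zu bytes, header uses %u\n",
                   sizeof(__tmp.__buf), (unsigned) msg->h_length);
            return 0;
    }

The plain char[] variant in the '+' lines instead assumes the stack buffer is adequately aligned for struct dlm_header, which the removed comment notes is not guaranteed on every architecture.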
diff --git a/trunk/fs/dlm/user.c b/trunk/fs/dlm/user.c index 7cbc6826239b..4f741546f4bb 100644 --- a/trunk/fs/dlm/user.c +++ b/trunk/fs/dlm/user.c @@ -24,7 +24,8 @@ #include "lvb_table.h" #include "user.h" -static const char name_prefix[] = "dlm"; +static const char *name_prefix="dlm"; +static struct miscdevice ctl_device; static const struct file_operations device_fops; #ifdef CONFIG_COMPAT @@ -81,8 +82,7 @@ struct dlm_lock_result32 { }; static void compat_input(struct dlm_write_request *kb, - struct dlm_write_request32 *kb32, - int max_namelen) + struct dlm_write_request32 *kb32) { kb->version[0] = kb32->version[0]; kb->version[1] = kb32->version[1]; @@ -112,11 +112,7 @@ static void compat_input(struct dlm_write_request *kb, kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr; kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb; memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); - if (kb->i.lock.namelen <= max_namelen) - memcpy(kb->i.lock.name, kb32->i.lock.name, - kb->i.lock.namelen); - else - kb->i.lock.namelen = max_namelen; + memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen); } } @@ -240,12 +236,12 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) spin_unlock(&proc->asts_spin); if (eol) { - spin_lock(&proc->locks_spin); + spin_lock(&ua->proc->locks_spin); if (!list_empty(&lkb->lkb_ownqueue)) { list_del_init(&lkb->lkb_ownqueue); dlm_put_lkb(lkb); } - spin_unlock(&proc->locks_spin); + spin_unlock(&ua->proc->locks_spin); } out: mutex_unlock(&ls->ls_clear_proc_locks); @@ -533,8 +529,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, if (proc) set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); - compat_input(kbuf, k32buf, - count - sizeof(struct dlm_write_request32)); + compat_input(kbuf, k32buf); kfree(k32buf); } #endif @@ -901,16 +896,14 @@ static const struct file_operations ctl_device_fops = { .owner = THIS_MODULE, }; -static struct miscdevice ctl_device = { - .name = "dlm-control", - .fops = &ctl_device_fops, - .minor = MISC_DYNAMIC_MINOR, -}; - int dlm_user_init(void) { int error; + ctl_device.name = "dlm-control"; + ctl_device.fops = &ctl_device_fops; + ctl_device.minor = MISC_DYNAMIC_MINOR; + error = misc_register(&ctl_device); if (error) log_print("misc_register failed for control device"); diff --git a/trunk/fs/dlm/util.c b/trunk/fs/dlm/util.c index 4d9c1f4e1bd1..963889cf6740 100644 --- a/trunk/fs/dlm/util.c +++ b/trunk/fs/dlm/util.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 
** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -14,14 +14,6 @@ #include "rcom.h" #include "util.h" -#define DLM_ERRNO_EDEADLK 35 -#define DLM_ERRNO_EBADR 53 -#define DLM_ERRNO_EBADSLT 57 -#define DLM_ERRNO_EPROTO 71 -#define DLM_ERRNO_EOPNOTSUPP 95 -#define DLM_ERRNO_ETIMEDOUT 110 -#define DLM_ERRNO_EINPROGRESS 115 - static void header_out(struct dlm_header *hd) { hd->h_version = cpu_to_le32(hd->h_version); @@ -38,54 +30,11 @@ static void header_in(struct dlm_header *hd) hd->h_length = le16_to_cpu(hd->h_length); } -/* higher errno values are inconsistent across architectures, so select - one set of values for on the wire */ - -static int to_dlm_errno(int err) -{ - switch (err) { - case -EDEADLK: - return -DLM_ERRNO_EDEADLK; - case -EBADR: - return -DLM_ERRNO_EBADR; - case -EBADSLT: - return -DLM_ERRNO_EBADSLT; - case -EPROTO: - return -DLM_ERRNO_EPROTO; - case -EOPNOTSUPP: - return -DLM_ERRNO_EOPNOTSUPP; - case -ETIMEDOUT: - return -DLM_ERRNO_ETIMEDOUT; - case -EINPROGRESS: - return -DLM_ERRNO_EINPROGRESS; - } - return err; -} - -static int from_dlm_errno(int err) -{ - switch (err) { - case -DLM_ERRNO_EDEADLK: - return -EDEADLK; - case -DLM_ERRNO_EBADR: - return -EBADR; - case -DLM_ERRNO_EBADSLT: - return -EBADSLT; - case -DLM_ERRNO_EPROTO: - return -EPROTO; - case -DLM_ERRNO_EOPNOTSUPP: - return -EOPNOTSUPP; - case -DLM_ERRNO_ETIMEDOUT: - return -ETIMEDOUT; - case -DLM_ERRNO_EINPROGRESS: - return -EINPROGRESS; - } - return err; -} - void dlm_message_out(struct dlm_message *ms) { - header_out(&ms->m_header); + struct dlm_header *hd = (struct dlm_header *) ms; + + header_out(hd); ms->m_type = cpu_to_le32(ms->m_type); ms->m_nodeid = cpu_to_le32(ms->m_nodeid); @@ -104,12 +53,14 @@ void dlm_message_out(struct dlm_message *ms) ms->m_rqmode = cpu_to_le32(ms->m_rqmode); ms->m_bastmode = cpu_to_le32(ms->m_bastmode); ms->m_asts = cpu_to_le32(ms->m_asts); - ms->m_result = cpu_to_le32(to_dlm_errno(ms->m_result)); + ms->m_result = cpu_to_le32(ms->m_result); } void dlm_message_in(struct dlm_message *ms) { - header_in(&ms->m_header); + struct dlm_header *hd = (struct dlm_header *) ms; + + header_in(hd); ms->m_type = le32_to_cpu(ms->m_type); ms->m_nodeid = le32_to_cpu(ms->m_nodeid); @@ -128,7 +79,7 @@ void dlm_message_in(struct dlm_message *ms) ms->m_rqmode = le32_to_cpu(ms->m_rqmode); ms->m_bastmode = le32_to_cpu(ms->m_bastmode); ms->m_asts = le32_to_cpu(ms->m_asts); - ms->m_result = from_dlm_errno(le32_to_cpu(ms->m_result)); + ms->m_result = le32_to_cpu(ms->m_result); } static void rcom_lock_out(struct rcom_lock *rl) @@ -175,9 +126,10 @@ static void rcom_config_in(struct rcom_config *rf) void dlm_rcom_out(struct dlm_rcom *rc) { + struct dlm_header *hd = (struct dlm_header *) rc; int type = rc->rc_type; - header_out(&rc->rc_header); + header_out(hd); rc->rc_type = cpu_to_le32(rc->rc_type); rc->rc_result = cpu_to_le32(rc->rc_result); @@ -185,7 +137,7 @@ void dlm_rcom_out(struct dlm_rcom *rc) rc->rc_seq = cpu_to_le64(rc->rc_seq); rc->rc_seq_reply = cpu_to_le64(rc->rc_seq_reply); - if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) + if (type == DLM_RCOM_LOCK) rcom_lock_out((struct rcom_lock *) rc->rc_buf); else if (type == DLM_RCOM_STATUS_REPLY) @@ -194,9 +146,9 @@ void dlm_rcom_out(struct dlm_rcom *rc) void dlm_rcom_in(struct dlm_rcom *rc) { - int type; + struct dlm_header *hd = (struct dlm_header *) rc; - header_in(&rc->rc_header); + header_in(hd); rc->rc_type = le32_to_cpu(rc->rc_type); 
rc->rc_result = le32_to_cpu(rc->rc_result); @@ -204,12 +156,10 @@ void dlm_rcom_in(struct dlm_rcom *rc) rc->rc_seq = le64_to_cpu(rc->rc_seq); rc->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); - type = rc->rc_type; - - if ((type == DLM_RCOM_LOCK) || (type == DLM_RCOM_LOCK_REPLY)) + if (rc->rc_type == DLM_RCOM_LOCK) rcom_lock_in((struct rcom_lock *) rc->rc_buf); - else if (type == DLM_RCOM_STATUS_REPLY) + else if (rc->rc_type == DLM_RCOM_STATUS_REPLY) rcom_config_in((struct rcom_config *) rc->rc_buf); } diff --git a/trunk/include/asm-x86/Kbuild b/trunk/include/asm-x86/Kbuild index 3c6f0f80e827..e6189b229143 100644 --- a/trunk/include/asm-x86/Kbuild +++ b/trunk/include/asm-x86/Kbuild @@ -3,7 +3,6 @@ include include/asm-generic/Kbuild.asm header-y += boot.h header-y += bootparam.h header-y += debugreg.h -header-y += kvm.h header-y += ldt.h header-y += msr-index.h header-y += prctl.h diff --git a/trunk/include/asm-x86/kvm.h b/trunk/include/asm-x86/kvm.h deleted file mode 100644 index 7a71120426a3..000000000000 --- a/trunk/include/asm-x86/kvm.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef __LINUX_KVM_X86_H -#define __LINUX_KVM_X86_H - -/* - * KVM x86 specific structures and definitions - * - */ - -#include -#include - -/* Architectural interrupt line count. */ -#define KVM_NR_INTERRUPTS 256 - -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - -/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ -struct kvm_pic_state { - __u8 last_irr; /* edge detection */ - __u8 irr; /* interrupt request register */ - __u8 imr; /* interrupt mask register */ - __u8 isr; /* interrupt service register */ - __u8 priority_add; /* highest irq priority */ - __u8 irq_base; - __u8 read_reg_select; - __u8 poll; - __u8 special_mask; - __u8 init_state; - __u8 auto_eoi; - __u8 rotate_on_auto_eoi; - __u8 special_fully_nested_mode; - __u8 init4; /* true if 4 byte init */ - __u8 elcr; /* PIIX edge/trigger selection */ - __u8 elcr_mask; -}; - -#define KVM_IOAPIC_NUM_PINS 24 -struct kvm_ioapic_state { - __u64 base_address; - __u32 ioregsel; - __u32 id; - __u32 irr; - __u32 pad; - union { - __u64 bits; - struct { - __u8 vector; - __u8 delivery_mode:3; - __u8 dest_mode:1; - __u8 delivery_status:1; - __u8 polarity:1; - __u8 remote_irr:1; - __u8 trig_mode:1; - __u8 mask:1; - __u8 reserve:7; - __u8 reserved[4]; - __u8 dest_id; - } fields; - } redirtbl[KVM_IOAPIC_NUM_PINS]; -}; - -#define KVM_IRQCHIP_PIC_MASTER 0 -#define KVM_IRQCHIP_PIC_SLAVE 1 -#define KVM_IRQCHIP_IOAPIC 2 - -/* for KVM_GET_REGS and KVM_SET_REGS */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -struct kvm_segment { - __u64 base; - __u32 limit; - __u16 selector; - __u8 type; - __u8 present, dpl, db, s, l, g, avl; - __u8 unusable; - __u8 padding; -}; - -struct kvm_dtable { - __u64 base; - __u16 limit; - __u16 padding[3]; -}; - - -/* for KVM_GET_SREGS and KVM_SET_SREGS */ -struct kvm_sregs { - /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 
63) / 64]; -}; - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -/* for KVM_GET_MSRS and KVM_SET_MSRS */ -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -/* for KVM_GET_MSR_INDEX_LIST */ -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - - -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 -#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 -#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 - -/* for KVM_SET_CPUID2 */ -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -#endif diff --git a/trunk/include/asm-x86/kvm_para.h b/trunk/include/asm-x86/kvm_para.h deleted file mode 100644 index c6f3fd8d8c53..000000000000 --- a/trunk/include/asm-x86/kvm_para.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef __X86_KVM_PARA_H -#define __X86_KVM_PARA_H - -/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It - * should be used to determine that a VM is running under KVM. - */ -#define KVM_CPUID_SIGNATURE 0x40000000 - -/* This CPUID returns a feature bitmap in eax. Before enabling a particular - * paravirtualization, the appropriate feature bit should be checked. - */ -#define KVM_CPUID_FEATURES 0x40000001 - -#ifdef __KERNEL__ -#include - -/* This instruction is vmcall. On non-VT architectures, it will generate a - * trap that we will then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" - -/* For KVM hypercalls, a three-byte sequence of either the vmrun or the vmmrun - * instruction. The hypervisor may replace it with something else but only the - * instructions are guaranteed to be supported. - * - * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. - * The hypercall number should be placed in rax and the return value will be - * placed in rax. No other registers will be clobbered unless explicited - * noted by the particular hypercall. 
- */ - -static inline long kvm_hypercall0(unsigned int nr) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr)); - return ret; -} - -static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1)); - return ret; -} - -static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, - unsigned long p2) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2)); - return ret; -} - -static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, - unsigned long p2, unsigned long p3) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2), "d"(p3)); - return ret; -} - -static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)); - return ret; -} - -static inline int kvm_para_available(void) -{ - unsigned int eax, ebx, ecx, edx; - char signature[13]; - - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; - - if (strcmp(signature, "KVMKVMKVM") == 0) - return 1; - - return 0; -} - -static inline unsigned int kvm_arch_para_features(void) -{ - return cpuid_eax(KVM_CPUID_FEATURES); -} - -#endif - -#endif diff --git a/trunk/include/asm-x86/lguest.h b/trunk/include/asm-x86/lguest.h index 4d9367b72976..1c8367a692f6 100644 --- a/trunk/include/asm-x86/lguest.h +++ b/trunk/include/asm-x86/lguest.h @@ -56,7 +56,7 @@ struct lguest_ro_state struct desc_struct guest_gdt[GDT_ENTRIES]; }; -struct lg_cpu_arch +struct lguest_arch { /* The GDT entries copied into lguest_ro_state when running. 
*/ struct desc_struct gdt[GDT_ENTRIES]; diff --git a/trunk/include/asm-x86/lguest_hcall.h b/trunk/include/asm-x86/lguest_hcall.h index 758b9a5d4539..2091779e91fb 100644 --- a/trunk/include/asm-x86/lguest_hcall.h +++ b/trunk/include/asm-x86/lguest_hcall.h @@ -4,7 +4,7 @@ #define LHCALL_FLUSH_ASYNC 0 #define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 +#define LHCALL_CRASH 2 #define LHCALL_LOAD_GDT 3 #define LHCALL_NEW_PGTABLE 4 #define LHCALL_FLUSH_TLB 5 @@ -20,10 +20,6 @@ #define LGUEST_TRAP_ENTRY 0x1F -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - #ifndef __ASSEMBLY__ #include diff --git a/trunk/include/linux/Kbuild b/trunk/include/linux/Kbuild index 85b2482cc736..27b9350052b4 100644 --- a/trunk/include/linux/Kbuild +++ b/trunk/include/linux/Kbuild @@ -100,6 +100,7 @@ header-y += iso_fs.h header-y += ixjuser.h header-y += jffs2.h header-y += keyctl.h +header-y += kvm.h header-y += limits.h header-y += lock_dlm_plock.h header-y += magic.h @@ -255,7 +256,6 @@ unifdef-y += kd.h unifdef-y += kernelcapi.h unifdef-y += kernel.h unifdef-y += keyboard.h -unifdef-$(CONFIG_HAVE_KVM) += kvm.h unifdef-y += llc.h unifdef-y += loop.h unifdef-y += lp.h diff --git a/trunk/include/linux/audit.h b/trunk/include/linux/audit.h index bdd6f5de5fc4..c68781692838 100644 --- a/trunk/include/linux/audit.h +++ b/trunk/include/linux/audit.h @@ -115,8 +115,6 @@ #define AUDIT_MAC_IPSEC_ADDSPD 1413 /* Not used */ #define AUDIT_MAC_IPSEC_DELSPD 1414 /* Not used */ #define AUDIT_MAC_IPSEC_EVENT 1415 /* Audit an IPSec event */ -#define AUDIT_MAC_UNLBL_STCADD 1416 /* NetLabel: add a static label */ -#define AUDIT_MAC_UNLBL_STCDEL 1417 /* NetLabel: del a static label */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/trunk/include/linux/device.h b/trunk/include/linux/device.h index db375be333c7..1880208964d6 100644 --- a/trunk/include/linux/device.h +++ b/trunk/include/linux/device.h @@ -84,9 +84,6 @@ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, struct device *bus_find_device(struct bus_type *bus, struct device *start, void *data, int (*match)(struct device *dev, void *data)); -struct device *bus_find_device_by_name(struct bus_type *bus, - struct device *start, - const char *name); int __must_check bus_for_each_drv(struct bus_type *bus, struct device_driver *start, void *data, diff --git a/trunk/include/linux/kvm.h b/trunk/include/linux/kvm.h index 4de4fd2d8607..057a7f34ee36 100644 --- a/trunk/include/linux/kvm.h +++ b/trunk/include/linux/kvm.h @@ -9,10 +9,12 @@ #include #include -#include #define KVM_API_VERSION 12 +/* Architectural interrupt line count. 
*/ +#define KVM_NR_INTERRUPTS 256 + /* for KVM_CREATE_MEMORY_REGION */ struct kvm_memory_region { __u32 slot; @@ -21,18 +23,16 @@ struct kvm_memory_region { __u64 memory_size; /* bytes */ }; -/* for KVM_SET_USER_MEMORY_REGION */ -struct kvm_userspace_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ - __u64 userspace_addr; /* start of the userspace allocated memory */ -}; - /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL +struct kvm_memory_alias { + __u32 slot; /* this has a different namespace than memory slots */ + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 target_phys_addr; +}; /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -45,18 +45,62 @@ struct kvm_irq_level { __u32 level; }; +/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ +struct kvm_pic_state { + __u8 last_irr; /* edge detection */ + __u8 irr; /* interrupt request register */ + __u8 imr; /* interrupt mask register */ + __u8 isr; /* interrupt service register */ + __u8 priority_add; /* highest irq priority */ + __u8 irq_base; + __u8 read_reg_select; + __u8 poll; + __u8 special_mask; + __u8 init_state; + __u8 auto_eoi; + __u8 rotate_on_auto_eoi; + __u8 special_fully_nested_mode; + __u8 init4; /* true if 4 byte init */ + __u8 elcr; /* PIIX edge/trigger selection */ + __u8 elcr_mask; +}; + +#define KVM_IOAPIC_NUM_PINS 24 +struct kvm_ioapic_state { + __u64 base_address; + __u32 ioregsel; + __u32 id; + __u32 irr; + __u32 pad; + union { + __u64 bits; + struct { + __u8 vector; + __u8 delivery_mode:3; + __u8 dest_mode:1; + __u8 delivery_status:1; + __u8 polarity:1; + __u8 remote_irr:1; + __u8 trig_mode:1; + __u8 mask:1; + __u8 reserve:7; + __u8 reserved[4]; + __u8 dest_id; + } fields; + } redirtbl[KVM_IOAPIC_NUM_PINS]; +}; + +#define KVM_IRQCHIP_PIC_MASTER 0 +#define KVM_IRQCHIP_PIC_SLAVE 1 +#define KVM_IRQCHIP_IOAPIC 2 struct kvm_irqchip { __u32 chip_id; __u32 pad; union { char dummy[512]; /* reserving space */ -#ifdef CONFIG_X86 struct kvm_pic_state pic; -#endif -#if defined(CONFIG_X86) || defined(CONFIG_IA64) struct kvm_ioapic_state ioapic; -#endif } chip; }; @@ -72,7 +116,6 @@ struct kvm_irqchip { #define KVM_EXIT_FAIL_ENTRY 9 #define KVM_EXIT_INTR 10 #define KVM_EXIT_SET_TPR 11 -#define KVM_EXIT_TPR_ACCESS 12 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { @@ -131,17 +174,90 @@ struct kvm_run { __u32 longmode; __u32 pad; } hypercall; - /* KVM_EXIT_TPR_ACCESS */ - struct { - __u64 rip; - __u32 is_write; - __u32 pad; - } tpr_access; /* Fix the size of the union. 
*/ char padding[256]; }; }; +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; +}; + +/* for KVM_GET_LAPIC and KVM_SET_LAPIC */ +#define KVM_APIC_REG_SIZE 0x400 +struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; +}; + +struct kvm_segment { + __u64 base; + __u32 limit; + __u16 selector; + __u8 type; + __u8 present, dpl, db, s, l, g, avl; + __u8 unusable; + __u8 padding; +}; + +struct kvm_dtable { + __u64 base; + __u16 limit; + __u16 padding[3]; +}; + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { + /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; +}; + +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; +}; + +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; +}; + +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + /* for KVM_TRANSLATE */ struct kvm_translation { /* in */ @@ -186,22 +302,26 @@ struct kvm_dirty_log { }; }; -/* for KVM_SET_SIGNAL_MASK */ -struct kvm_signal_mask { - __u32 len; - __u8 sigset[0]; +struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; }; -/* for KVM_TPR_ACCESS_REPORTING */ -struct kvm_tpr_access_ctl { - __u32 enabled; - __u32 flags; - __u32 reserved[8]; +/* for KVM_SET_CPUID */ +struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; }; -/* for KVM_SET_VAPIC_ADDR */ -struct kvm_vapic_addr { - __u64 vapic_addr; +/* for KVM_SET_SIGNAL_MASK */ +struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; }; #define KVMIO 0xAE @@ -227,21 +347,11 @@ struct kvm_vapic_addr { */ #define KVM_CAP_IRQCHIP 0 #define KVM_CAP_HLT 1 -#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 -#define KVM_CAP_USER_MEMORY 3 -#define KVM_CAP_SET_TSS_ADDR 4 -#define KVM_CAP_EXT_CPUID 5 -#define KVM_CAP_VAPIC 6 /* * ioctls for VM fds */ #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) -#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) -#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) -#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ - struct kvm_userspace_memory_region) -#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) /* * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns * a vcpu fd. 
@@ -249,7 +359,6 @@ struct kvm_vapic_addr { #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) -#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) @@ -275,11 +384,5 @@ struct kvm_vapic_addr { #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) #define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) #define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) -#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) -#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) -/* Available with KVM_CAP_VAPIC */ -#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) -/* Available with KVM_CAP_VAPIC */ -#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) #endif diff --git a/trunk/include/linux/kvm_host.h b/trunk/include/linux/kvm_host.h deleted file mode 100644 index ea4764b0a2f4..000000000000 --- a/trunk/include/linux/kvm_host.h +++ /dev/null @@ -1,299 +0,0 @@ -#ifndef __KVM_HOST_H -#define __KVM_HOST_H - -/* - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -#define KVM_MAX_VCPUS 4 -#define KVM_MEMORY_SLOTS 8 -/* memory slots that does not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 4 - -#define KVM_PIO_PAGE_OFFSET 1 - -/* - * vcpu->requests bit members - */ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MIGRATE_TIMER 1 -#define KVM_REQ_REPORT_TPR_ACCESS 2 - -struct kvm_vcpu; -extern struct kmem_cache *kvm_vcpu_cache; - -struct kvm_guest_debug { - int enabled; - unsigned long bp[4]; - int singlestep; -}; - -/* - * It would be nice to use something smarter than a linear search, TBD... - * Thankfully we dont expect many devices to register (famous last words :), - * so until then it will suffice. At least its abstracted so we can change - * in one place. 
- */ -struct kvm_io_bus { - int dev_count; -#define NR_IOBUS_DEVS 6 - struct kvm_io_device *devs[NR_IOBUS_DEVS]; -}; - -void kvm_io_bus_init(struct kvm_io_bus *bus); -void kvm_io_bus_destroy(struct kvm_io_bus *bus); -struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); -void kvm_io_bus_register_dev(struct kvm_io_bus *bus, - struct kvm_io_device *dev); - -struct kvm_vcpu { - struct kvm *kvm; - struct preempt_notifier preempt_notifier; - int vcpu_id; - struct mutex mutex; - int cpu; - struct kvm_run *run; - int guest_mode; - unsigned long requests; - struct kvm_guest_debug guest_debug; - int fpu_active; - int guest_fpu_loaded; - wait_queue_head_t wq; - int sigset_active; - sigset_t sigset; - struct kvm_vcpu_stat stat; - -#ifdef CONFIG_HAS_IOMEM - int mmio_needed; - int mmio_read_completed; - int mmio_is_write; - int mmio_size; - unsigned char mmio_data[8]; - gpa_t mmio_phys_addr; -#endif - - struct kvm_vcpu_arch arch; -}; - -struct kvm_memory_slot { - gfn_t base_gfn; - unsigned long npages; - unsigned long flags; - unsigned long *rmap; - unsigned long *dirty_bitmap; - unsigned long userspace_addr; - int user_alloc; -}; - -struct kvm { - struct mutex lock; /* protects the vcpus array and APIC accesses */ - spinlock_t mmu_lock; - struct mm_struct *mm; /* userspace tied to this vm */ - int nmemslots; - struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + - KVM_PRIVATE_MEM_SLOTS]; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; - struct list_head vm_list; - struct file *filp; - struct kvm_io_bus mmio_bus; - struct kvm_io_bus pio_bus; - struct kvm_vm_stat stat; - struct kvm_arch arch; -}; - -/* The guest did something we don't support. */ -#define pr_unimpl(vcpu, fmt, ...) \ - do { \ - if (printk_ratelimit()) \ - printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ - current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ - } while (0) - -#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) -#define vcpu_printf(vcpu, fmt...) 
kvm_printf(vcpu->kvm, fmt) - -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); - -void vcpu_load(struct kvm_vcpu *vcpu); -void vcpu_put(struct kvm_vcpu *vcpu); - -void decache_vcpus_on_cpu(int cpu); - - -int kvm_init(void *opaque, unsigned int vcpu_size, - struct module *module); -void kvm_exit(void); - -#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) -#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) -static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); - -extern struct page *bad_page; - -int is_error_page(struct page *page); -int kvm_is_error_hva(unsigned long addr); -int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc); -int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc); -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc); -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); -void kvm_release_page_clean(struct page *page); -void kvm_release_page_dirty(struct page *page); -int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, - int len); -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len); -int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, - int offset, int len); -int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, - unsigned long len); -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); -int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); -int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); -void mark_page_dirty(struct kvm *kvm, gfn_t gfn); - -void kvm_vcpu_block(struct kvm_vcpu *vcpu); -void kvm_resched(struct kvm_vcpu *vcpu); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_flush_remote_tlbs(struct kvm *kvm); - -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); - -int kvm_dev_ioctl_check_extension(long ext); - -int kvm_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log, int *is_dirty); -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log); - -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - int user_alloc); -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -void kvm_arch_destroy_vm(struct kvm *kvm); - -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); - -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr); - -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs); -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu 
*vcpu, - struct kvm_sregs *sregs); -int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, - struct kvm_debug_guest *dbg); -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); - -int kvm_arch_init(void *opaque); -void kvm_arch_exit(void); - -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); - -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); - -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); -void kvm_arch_hardware_enable(void *garbage); -void kvm_arch_hardware_disable(void *garbage); -int kvm_arch_hardware_setup(void); -void kvm_arch_hardware_unsetup(void); -void kvm_arch_check_processor_compat(void *rtn); -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); - -void kvm_free_physmem(struct kvm *kvm); - -struct kvm *kvm_arch_create_vm(void); -void kvm_arch_destroy_vm(struct kvm *kvm); - -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -int kvm_cpu_has_interrupt(struct kvm_vcpu *v); -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); - -static inline void kvm_guest_enter(void) -{ - account_system_vtime(current); - current->flags |= PF_VCPU; -} - -static inline void kvm_guest_exit(void) -{ - account_system_vtime(current); - current->flags &= ~PF_VCPU; -} - -static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - return slot - kvm->memslots; -} - -static inline gpa_t gfn_to_gpa(gfn_t gfn) -{ - return (gpa_t)gfn << PAGE_SHIFT; -} - -static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - -enum kvm_stat_kind { - KVM_STAT_VM, - KVM_STAT_VCPU, -}; - -struct kvm_stats_debugfs_item { - const char *name; - int offset; - enum kvm_stat_kind kind; - struct dentry *dentry; -}; -extern struct kvm_stats_debugfs_item debugfs_entries[]; - -#endif diff --git a/trunk/include/linux/kvm_para.h b/trunk/include/linux/kvm_para.h index 5497aac0d2f8..3b292565a693 100644 --- a/trunk/include/linux/kvm_para.h +++ b/trunk/include/linux/kvm_para.h @@ -2,30 +2,72 @@ #define __LINUX_KVM_PARA_H /* - * This header file provides a method for making a hypercall to the host - * Architectures should define: - * - kvm_hypercall0, kvm_hypercall1... - * - kvm_arch_para_features - * - kvm_para_available + * Guest OS interface for KVM paravirtualization + * + * Note: this interface is totally experimental, and is certain to change + * as we make progress. */ -/* Return values for hypercalls */ -#define KVM_ENOSYS 1000 +/* + * Per-VCPU descriptor area shared between guest and host. Writable to + * both guest and host. Registered with the host by the guest when + * a guest acknowledges paravirtual mode. + * + * NOTE: all addresses are guest-physical addresses (gpa), to make it + * easier for the hypervisor to map between the various addresses. + */ +struct kvm_vcpu_para_state { + /* + * API version information for compatibility. If there's any support + * mismatch (too old host trying to execute too new guest) then + * the host will deny entry into paravirtual mode. Any other + * combination (new host + old guest and new host + new guest) + * is supposed to work - new host versions will support all old + * guest API versions. 
+ */ + u32 guest_version; + u32 host_version; + u32 size; + u32 ret; + + /* + * The address of the vm exit instruction (VMCALL or VMMCALL), + * which the host will patch according to the CPU model the + * VM runs on: + */ + u64 hypercall_gpa; + +} __attribute__ ((aligned(PAGE_SIZE))); + +#define KVM_PARA_API_VERSION 1 + +/* + * This is used for an RDMSR's ECX parameter to probe for a KVM host. + * Hopefully no CPU vendor will use up this number. This is placed well + * out of way of the typical space occupied by CPU vendors' MSR indices, + * and we think (or at least hope) it wont be occupied in the future + * either. + */ +#define MSR_KVM_API_MAGIC 0x87655678 -#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_EINVAL 1 /* - * hypercalls use architecture specific + * Hypercall calling convention: + * + * Each hypercall may have 0-6 parameters. + * + * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1 + * + * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention + * order: RDI, RSI, RDX, RCX, R8, R9. + * + * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP. + * (the first 3 are according to the gcc regparm calling convention) + * + * No registers are clobbered by the hypercall, except that the + * return value is in RAX. */ -#include - -#ifdef __KERNEL__ -static inline int kvm_para_has_feature(unsigned int feature) -{ - if (kvm_arch_para_features() & (1UL << feature)) - return 1; - return 0; -} -#endif /* __KERNEL__ */ -#endif /* __LINUX_KVM_PARA_H */ +#define __NR_hypercalls 0 +#endif diff --git a/trunk/include/linux/kvm_types.h b/trunk/include/linux/kvm_types.h deleted file mode 100644 index 1c4e46decb22..000000000000 --- a/trunk/include/linux/kvm_types.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- * - */ - -#ifndef __KVM_TYPES_H__ -#define __KVM_TYPES_H__ - -#include - -/* - * Address types: - * - * gva - guest virtual address - * gpa - guest physical address - * gfn - guest frame number - * hva - host virtual address - * hpa - host physical address - * hfn - host frame number - */ - -typedef unsigned long gva_t; -typedef u64 gpa_t; -typedef unsigned long gfn_t; - -typedef unsigned long hva_t; -typedef u64 hpa_t; -typedef unsigned long hfn_t; - -struct kvm_pio_request { - unsigned long count; - int cur_count; - struct page *guest_pages[2]; - unsigned guest_page_offset; - int in; - int port; - int size; - int string; - int down; - int rep; -}; - -#endif /* __KVM_TYPES_H__ */ diff --git a/trunk/include/linux/selinux.h b/trunk/include/linux/selinux.h index 8c2cc4c02526..6080f73fc85f 100644 --- a/trunk/include/linux/selinux.h +++ b/trunk/include/linux/selinux.h @@ -120,35 +120,16 @@ void selinux_get_task_sid(struct task_struct *tsk, u32 *sid); int selinux_string_to_sid(char *str, u32 *sid); /** - * selinux_secmark_relabel_packet_permission - secmark permission check - * @sid: SECMARK ID value to be applied to network packet + * selinux_relabel_packet_permission - check permission to relabel a packet + * @sid: ID value to be applied to network packet (via SECMARK, most likely) * - * Returns 0 if the current task is allowed to set the SECMARK label of - * packets with the supplied security ID. Note that it is implicit that - * the packet is always being relabeled from the default unlabeled value, - * and that the access control decision is made in the AVC. + * Returns 0 if the current task is allowed to label packets with the + * supplied security ID. Note that it is implicit that the packet is always + * being relabeled from the default unlabled value, and that the access + * control decision is made in the AVC. */ -int selinux_secmark_relabel_packet_permission(u32 sid); +int selinux_relabel_packet_permission(u32 sid); -/** - * selinux_secmark_refcount_inc - increments the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function incements this reference count to indicate that a new SECMARK - * target has been configured. - */ -void selinux_secmark_refcount_inc(void); - -/** - * selinux_secmark_refcount_dec - decrements the secmark use counter - * - * SELinux keeps track of the current SECMARK targets in use so it knows - * when to apply SECMARK label access checks to network packets. This - * function decements this reference count to indicate that one of the - * existing SECMARK targets has been removed/flushed. 
- */ -void selinux_secmark_refcount_dec(void); #else static inline int selinux_audit_rule_init(u32 field, u32 op, @@ -203,21 +184,11 @@ static inline int selinux_string_to_sid(const char *str, u32 *sid) return 0; } -static inline int selinux_secmark_relabel_packet_permission(u32 sid) +static inline int selinux_relabel_packet_permission(u32 sid) { return 0; } -static inline void selinux_secmark_refcount_inc(void) -{ - return; -} - -static inline void selinux_secmark_refcount_dec(void) -{ - return; -} - #endif /* CONFIG_SECURITY_SELINUX */ #endif /* _LINUX_SELINUX_H */ diff --git a/trunk/include/net/netlabel.h b/trunk/include/net/netlabel.h index b3213c7c5309..2e5b2f6f9fa0 100644 --- a/trunk/include/net/netlabel.h +++ b/trunk/include/net/netlabel.h @@ -67,11 +67,7 @@ * NetLabel NETLINK protocol */ -/* NetLabel NETLINK protocol version - * 1: initial version - * 2: added static labels for unlabeled connections - */ -#define NETLBL_PROTO_VERSION 2 +#define NETLBL_PROTO_VERSION 1 /* NetLabel NETLINK types/families */ #define NETLBL_NLTYPE_NONE 0 @@ -109,49 +105,17 @@ struct netlbl_dom_map; /* Domain mapping operations */ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); -/* - * LSM security attributes - */ - -/** - * struct netlbl_lsm_cache - NetLabel LSM security attribute cache - * @refcount: atomic reference counter - * @free: LSM supplied function to free the cache data - * @data: LSM supplied cache data - * - * Description: - * This structure is provided for LSMs which wish to make use of the NetLabel - * caching mechanism to store LSM specific data/attributes in the NetLabel - * cache. If the LSM has to perform a lot of translation from the NetLabel - * security attributes into it's own internal representation then the cache - * mechanism can provide a way to eliminate some or all of that translation - * overhead on a cache hit. - * - */ +/* LSM security attributes */ struct netlbl_lsm_cache { atomic_t refcount; void (*free) (const void *data); void *data; }; - -/** - * struct netlbl_lsm_secattr_catmap - NetLabel LSM secattr category bitmap - * @startbit: the value of the lowest order bit in the bitmap - * @bitmap: the category bitmap - * @next: pointer to the next bitmap "node" or NULL - * - * Description: - * This structure is used to represent category bitmaps. Due to the large - * number of categories supported by most labeling protocols it is not - * practical to transfer a full bitmap internally so NetLabel adopts a sparse - * bitmap structure modeled after SELinux's ebitmap structure. - * The catmap bitmap field MUST be a power of two in length and large +/* The catmap bitmap field MUST be a power of two in length and large * enough to hold at least 240 bits. Special care (i.e. check the code!) * should be used when changing these values as the LSM implementation * probably has functions which rely on the sizes of these types to speed - * processing. - * - */ + * processing. 
*/ #define NETLBL_CATMAP_MAPTYPE u64 #define NETLBL_CATMAP_MAPCNT 4 #define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) @@ -163,48 +127,22 @@ struct netlbl_lsm_secattr_catmap { NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; struct netlbl_lsm_secattr_catmap *next; }; - -/** - * struct netlbl_lsm_secattr - NetLabel LSM security attributes - * @flags: indicate which attributes are contained in this structure - * @type: indicate the NLTYPE of the attributes - * @domain: the NetLabel LSM domain - * @cache: NetLabel LSM specific cache - * @attr.mls: MLS sensitivity label - * @attr.mls.cat: MLS category bitmap - * @attr.mls.lvl: MLS sensitivity level - * @attr.secid: LSM specific secid token - * - * Description: - * This structure is used to pass security attributes between NetLabel and the - * LSM modules. The flags field is used to specify which fields within the - * struct are valid and valid values can be created by bitwise OR'ing the - * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to - * specify domain specific configuration settings and is not usually used by - * NetLabel itself when returning security attributes to the LSM. - * - */ #define NETLBL_SECATTR_NONE 0x00000000 #define NETLBL_SECATTR_DOMAIN 0x00000001 #define NETLBL_SECATTR_CACHE 0x00000002 #define NETLBL_SECATTR_MLS_LVL 0x00000004 #define NETLBL_SECATTR_MLS_CAT 0x00000008 -#define NETLBL_SECATTR_SECID 0x00000010 #define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ - NETLBL_SECATTR_MLS_CAT | \ - NETLBL_SECATTR_SECID) + NETLBL_SECATTR_MLS_CAT) struct netlbl_lsm_secattr { u32 flags; - u32 type; + char *domain; + + u32 mls_lvl; + struct netlbl_lsm_secattr_catmap *mls_cat; + struct netlbl_lsm_cache *cache; - union { - struct { - struct netlbl_lsm_secattr_catmap *cat; - u32 lvl; - } mls; - u32 secid; - } attr; }; /* @@ -293,7 +231,10 @@ static inline void netlbl_secattr_catmap_free( */ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) { - memset(secattr, 0, sizeof(*secattr)); + secattr->flags = 0; + secattr->domain = NULL; + secattr->mls_cat = NULL; + secattr->cache = NULL; } /** @@ -307,11 +248,11 @@ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) */ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) { - kfree(secattr->domain); - if (secattr->flags & NETLBL_SECATTR_CACHE) + if (secattr->cache) netlbl_secattr_cache_free(secattr->cache); - if (secattr->flags & NETLBL_SECATTR_MLS_CAT) - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + kfree(secattr->domain); + if (secattr->mls_cat) + netlbl_secattr_catmap_free(secattr->mls_cat); } /** @@ -359,7 +300,7 @@ int netlbl_secattr_catmap_setrng(struct netlbl_lsm_secattr_catmap *catmap, gfp_t flags); /* - * LSM protocol operations (NetLabel LSM/kernel API) + * LSM protocol operations */ int netlbl_enabled(void); int netlbl_sock_setattr(struct sock *sk, @@ -367,7 +308,6 @@ int netlbl_sock_setattr(struct sock *sk, int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); int netlbl_skbuff_getattr(const struct sk_buff *skb, - u16 family, struct netlbl_lsm_secattr *secattr); void netlbl_skbuff_err(struct sk_buff *skb, int error); @@ -420,7 +360,6 @@ static inline int netlbl_sock_getattr(struct sock *sk, return -ENOSYS; } static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, - u16 family, struct netlbl_lsm_secattr *secattr) { return -ENOSYS; diff --git a/trunk/include/scsi/scsi.h b/trunk/include/scsi/scsi.h index 
82251575a9b4..702fcfeb37f1 100644 --- a/trunk/include/scsi/scsi.h +++ b/trunk/include/scsi/scsi.h @@ -10,25 +10,6 @@ #include -/* - * The maximum number of SG segments that we will put inside a - * scatterlist (unless chaining is used). Should ideally fit inside a - * single page, to avoid a higher order allocation. We could define this - * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The - * minimum value is 32 - */ -#define SCSI_MAX_SG_SEGMENTS 128 - -/* - * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit - * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. - */ -#ifdef ARCH_HAS_SG_CHAIN -#define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 -#else -#define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS -#endif - /* * SCSI command lengths */ @@ -102,7 +83,6 @@ extern const unsigned char scsi_command_size[8]; #define READ_TOC 0x43 #define LOG_SELECT 0x4c #define LOG_SENSE 0x4d -#define XDWRITEREAD_10 0x53 #define MODE_SELECT_10 0x55 #define RESERVE_10 0x56 #define RELEASE_10 0x57 diff --git a/trunk/include/scsi/scsi_cmnd.h b/trunk/include/scsi/scsi_cmnd.h index de28aab820b0..a457fca66f61 100644 --- a/trunk/include/scsi/scsi_cmnd.h +++ b/trunk/include/scsi/scsi_cmnd.h @@ -2,20 +2,15 @@ #define _SCSI_SCSI_CMND_H #include -#include #include #include #include #include +struct request; struct Scsi_Host; struct scsi_device; -struct scsi_data_buffer { - struct sg_table table; - unsigned length; - int resid; -}; /* embedded in scsi_cmnd */ struct scsi_pointer { @@ -66,11 +61,15 @@ struct scsi_cmnd { /* These elements define the operation we are about to perform */ #define MAX_COMMAND_SIZE 16 unsigned char cmnd[MAX_COMMAND_SIZE]; + unsigned request_bufflen; /* Actual request size */ struct timer_list eh_timeout; /* Used to time out the command. */ + void *request_buffer; /* Actual requested buffer */ /* These elements define the operation we ultimately want to perform */ - struct scsi_data_buffer sdb; + struct sg_table sg_table; + unsigned short use_sg; /* Number of pieces of scatter-gather */ + unsigned underflow; /* Return error if less than this amount is transferred */ @@ -80,6 +79,10 @@ struct scsi_cmnd { reconnects. 
Probably == sector size */ + int resid; /* Number of bytes requested to be + transferred less actual number + transferred (0 if not supported) */ + struct request *request; /* The command we are working on */ @@ -124,55 +127,27 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); -extern int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask); -extern void scsi_release_buffers(struct scsi_cmnd *cmd); +extern int scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t); +extern void scsi_free_sgtable(struct scsi_cmnd *); extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); -static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) -{ - return cmd->sdb.table.nents; -} - -static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) -{ - return cmd->sdb.table.sgl; -} - -static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) -{ - return cmd->sdb.length; -} +#define scsi_sg_count(cmd) ((cmd)->use_sg) +#define scsi_sglist(cmd) ((cmd)->sg_table.sgl) +#define scsi_bufflen(cmd) ((cmd)->request_bufflen) static inline void scsi_set_resid(struct scsi_cmnd *cmd, int resid) { - cmd->sdb.resid = resid; + cmd->resid = resid; } static inline int scsi_get_resid(struct scsi_cmnd *cmd) { - return cmd->sdb.resid; + return cmd->resid; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) -static inline int scsi_bidi_cmnd(struct scsi_cmnd *cmd) -{ - return blk_bidi_rq(cmd->request) && - (cmd->request->next_rq->special != NULL); -} - -static inline struct scsi_data_buffer *scsi_in(struct scsi_cmnd *cmd) -{ - return scsi_bidi_cmnd(cmd) ? - cmd->request->next_rq->special : &cmd->sdb; -} - -static inline struct scsi_data_buffer *scsi_out(struct scsi_cmnd *cmd) -{ - return &cmd->sdb; -} - #endif /* _SCSI_SCSI_CMND_H */ diff --git a/trunk/include/scsi/scsi_eh.h b/trunk/include/scsi/scsi_eh.h index 25071d5d9bf8..d21b8913ceb3 100644 --- a/trunk/include/scsi/scsi_eh.h +++ b/trunk/include/scsi/scsi_eh.h @@ -68,15 +68,16 @@ extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len, extern int scsi_reset_provider(struct scsi_device *, int); struct scsi_eh_save { - /* saved state */ int result; enum dma_data_direction data_direction; unsigned char cmd_len; unsigned char cmnd[MAX_COMMAND_SIZE]; - struct scsi_data_buffer sdb; - struct request *next_rq; - /* new command support */ + void *buffer; + unsigned bufflen; + unsigned short use_sg; + int resid; + struct scatterlist sense_sgl; }; diff --git a/trunk/include/scsi/scsi_host.h b/trunk/include/scsi/scsi_host.h index 5c58d594126a..0fd4746ee39d 100644 --- a/trunk/include/scsi/scsi_host.h +++ b/trunk/include/scsi/scsi_host.h @@ -39,6 +39,9 @@ struct blk_queue_tags; #define DISABLE_CLUSTERING 0 #define ENABLE_CLUSTERING 1 +#define DISABLE_SG_CHAINING 0 +#define ENABLE_SG_CHAINING 1 + enum scsi_eh_timer_return { EH_NOT_HANDLED, EH_HANDLED, @@ -133,9 +136,9 @@ struct scsi_host_template { * the done callback is invoked. * * This is called to inform the LLD to transfer - * scsi_bufflen(cmd) bytes. scsi_sg_count(cmd) speciefies the + * cmd->request_bufflen bytes. The cmd->use_sg speciefies the * number of scatterlist entried in the command and - * scsi_sglist(cmd) returns the scatterlist. + * cmd->request_buffer contains the scatterlist. 
* * return values: see queuecommand * @@ -442,6 +445,15 @@ struct scsi_host_template { */ unsigned ordered_tag:1; + /* + * true if the low-level driver can support sg chaining. this + * will be removed eventually when all the drivers are + * converted to support sg chaining. + * + * Status: OBSOLETE + */ + unsigned use_sg_chaining:1; + /* * Countdown for host blocking with no commands outstanding */ @@ -586,6 +598,7 @@ struct Scsi_Host { unsigned unchecked_isa_dma:1; unsigned use_clustering:1; unsigned use_blk_tcq:1; + unsigned use_sg_chaining:1; /* * Host has requested that no further requests come through for the diff --git a/trunk/kernel/fork.c b/trunk/kernel/fork.c index 05e0b6f4365b..314f5101d2b0 100644 --- a/trunk/kernel/fork.c +++ b/trunk/kernel/fork.c @@ -393,7 +393,6 @@ void fastcall __mmdrop(struct mm_struct *mm) destroy_context(mm); free_mm(mm); } -EXPORT_SYMBOL_GPL(__mmdrop); /* * Decrement the use count and release all resources for an mm. diff --git a/trunk/net/ipv4/cipso_ipv4.c b/trunk/net/ipv4/cipso_ipv4.c index a2241060113b..d4dc4eb48d95 100644 --- a/trunk/net/ipv4/cipso_ipv4.c +++ b/trunk/net/ipv4/cipso_ipv4.c @@ -348,7 +348,6 @@ static int cipso_v4_cache_check(const unsigned char *key, atomic_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; - secattr->type = NETLBL_NLTYPE_CIPSOV4; if (prev_entry == NULL) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; @@ -866,7 +865,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, } for (;;) { - host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, + host_spot = netlbl_secattr_catmap_walk(secattr->mls_cat, host_spot + 1); if (host_spot < 0) break; @@ -949,7 +948,7 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, return -EPERM; break; } - ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, + ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, host_spot, GFP_ATOMIC); if (ret_val != 0) @@ -1015,8 +1014,7 @@ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, u32 cat_iter = 0; for (;;) { - cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, - cat + 1); + cat = netlbl_secattr_catmap_walk(secattr->mls_cat, cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) @@ -1051,7 +1049,7 @@ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { - ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat, + ret_val = netlbl_secattr_catmap_setbit(secattr->mls_cat, ntohs(get_unaligned((__be16 *)&net_cat[iter])), GFP_ATOMIC); if (ret_val != 0) @@ -1132,8 +1130,7 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, return -ENOSPC; for (;;) { - iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, - iter + 1); + iter = netlbl_secattr_catmap_walk(secattr->mls_cat, iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 
0 : sizeof(u16)); @@ -1141,8 +1138,7 @@ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, return -ENOSPC; array[array_cnt++] = iter; - iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat, - iter); + iter = netlbl_secattr_catmap_walk_rng(secattr->mls_cat, iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); @@ -1195,7 +1191,7 @@ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, else cat_low = 0; - ret_val = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat, + ret_val = netlbl_secattr_catmap_setrng(secattr->mls_cat, cat_low, cat_high, GFP_ATOMIC); @@ -1255,9 +1251,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, - secattr->attr.mls.lvl, - &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); if (ret_val != 0) return ret_val; @@ -1309,13 +1303,12 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->attr.mls.lvl = level; + secattr->mls_lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->attr.mls.cat == NULL) + secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->mls_cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, @@ -1323,7 +1316,7 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + netlbl_secattr_catmap_free(secattr->mls_cat); return ret_val; } @@ -1357,9 +1350,7 @@ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, - secattr->attr.mls.lvl, - &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); if (ret_val != 0) return ret_val; @@ -1405,13 +1396,12 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->attr.mls.lvl = level; + secattr->mls_lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->attr.mls.cat == NULL) + secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->mls_cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, @@ -1419,7 +1409,7 @@ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + netlbl_secattr_catmap_free(secattr->mls_cat); return ret_val; } @@ -1453,9 +1443,7 @@ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; - ret_val = cipso_v4_map_lvl_hton(doi_def, - secattr->attr.mls.lvl, - &level); + ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level); if (ret_val != 0) return ret_val; @@ -1500,13 +1488,12 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; - secattr->attr.mls.lvl = level; + secattr->mls_lvl = level; secattr->flags |= 
NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { - secattr->attr.mls.cat = - netlbl_secattr_catmap_alloc(GFP_ATOMIC); - if (secattr->attr.mls.cat == NULL) + secattr->mls_cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC); + if (secattr->mls_cat == NULL) return -ENOMEM; ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, @@ -1514,7 +1501,7 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, tag_len - 4, secattr); if (ret_val != 0) { - netlbl_secattr_catmap_free(secattr->attr.mls.cat); + netlbl_secattr_catmap_free(secattr->mls_cat); return ret_val; } @@ -1863,8 +1850,6 @@ static int cipso_v4_getattr(const unsigned char *cipso, ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; } - if (ret_val == 0) - secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); diff --git a/trunk/net/netfilter/xt_SECMARK.c b/trunk/net/netfilter/xt_SECMARK.c index 7708e2084ce2..b11b3ecbb39d 100644 --- a/trunk/net/netfilter/xt_SECMARK.c +++ b/trunk/net/netfilter/xt_SECMARK.c @@ -72,13 +72,12 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info) return false; } - err = selinux_secmark_relabel_packet_permission(sel->selsid); + err = selinux_relabel_packet_permission(sel->selsid); if (err) { printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); return false; } - selinux_secmark_refcount_inc(); return true; } @@ -111,20 +110,11 @@ secmark_tg_check(const char *tablename, const void *entry, return true; } -void secmark_tg_destroy(const struct xt_target *target, void *targinfo) -{ - switch (mode) { - case SECMARK_MODE_SEL: - selinux_secmark_refcount_dec(); - } -} - static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .family = AF_INET, .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", @@ -134,7 +124,6 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { .name = "SECMARK", .family = AF_INET6, .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, .target = secmark_tg, .targetsize = sizeof(struct xt_secmark_target_info), .table = "mangle", diff --git a/trunk/net/netlabel/netlabel_cipso_v4.c b/trunk/net/netlabel/netlabel_cipso_v4.c index becf91a952ae..ba0ca8d3f77d 100644 --- a/trunk/net/netlabel/netlabel_cipso_v4.c +++ b/trunk/net/netlabel/netlabel_cipso_v4.c @@ -38,7 +38,6 @@ #include #include #include -#include #include "netlabel_user.h" #include "netlabel_cipso_v4.h" @@ -422,7 +421,7 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) break; } if (ret_val == 0) - atomic_inc(&netlabel_mgmt_protocount); + netlbl_mgmt_protocount_inc(); audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_ADD, &audit_info); @@ -699,7 +698,7 @@ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) &audit_info, netlbl_cipsov4_doi_free); if (ret_val == 0) - atomic_dec(&netlabel_mgmt_protocount); + netlbl_mgmt_protocount_dec(); audit_buf = netlbl_audit_start_common(AUDIT_MAC_CIPSOV4_DEL, &audit_info); diff --git a/trunk/net/netlabel/netlabel_domainhash.c b/trunk/net/netlabel/netlabel_domainhash.c index 9a8ea0195c4f..b3675bd7db33 100644 --- a/trunk/net/netlabel/netlabel_domainhash.c +++ b/trunk/net/netlabel/netlabel_domainhash.c @@ -54,6 +54,9 @@ struct netlbl_domhsh_tbl { * hash table should be okay */ static DEFINE_SPINLOCK(netlbl_domhsh_lock); static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; + +/* Default domain mapping */ +static 
DEFINE_SPINLOCK(netlbl_domhsh_def_lock); static struct netlbl_dom_map *netlbl_domhsh_def = NULL; /* @@ -106,14 +109,17 @@ static u32 netlbl_domhsh_hash(const char *key) /** * netlbl_domhsh_search - Search for a domain entry * @domain: the domain + * @def: return default if no match is found * * Description: * Searches the domain hash table and returns a pointer to the hash table - * entry if found, otherwise NULL is returned. The caller is responsibile for - * the rcu hash table locks (i.e. the caller much call rcu_read_[un]lock()). + * entry if found, otherwise NULL is returned. If @def is non-zero and a + * match is not found in the domain hash table the default mapping is returned + * if it exists. The caller is responsible for the rcu hash table locks + * (i.e. the caller must call rcu_read_[un]lock()). * */ -static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) +static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, u32 def) { u32 bkt; struct netlbl_dom_map *iter; @@ -127,31 +133,10 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) return iter; } - return NULL; -} - -/** - * netlbl_domhsh_search_def - Search for a domain entry - * @domain: the domain - * @def: return default if no match is found - * - * Description: - * Searches the domain hash table and returns a pointer to the hash table - * entry if an exact match is found, if an exact match is not present in the - * hash table then the default entry is returned if valid otherwise NULL is - * returned. The caller is responsibile for the rcu hash table locks - * (i.e. the caller much call rcu_read_[un]lock()). - * - */ -static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) -{ - struct netlbl_dom_map *entry; - - entry = netlbl_domhsh_search(domain); - if (entry == NULL) { - entry = rcu_dereference(netlbl_domhsh_def); - if (entry != NULL && entry->valid) - return entry; + if (def != 0) { + iter = rcu_dereference(netlbl_domhsh_def); + if (iter != NULL && iter->valid) + return iter; } return NULL; @@ -236,22 +221,24 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, INIT_RCU_HEAD(&entry->rcu); rcu_read_lock(); - spin_lock(&netlbl_domhsh_lock); if (entry->domain != NULL) { bkt = netlbl_domhsh_hash(entry->domain); - if (netlbl_domhsh_search(entry->domain) == NULL) + spin_lock(&netlbl_domhsh_lock); + if (netlbl_domhsh_search(entry->domain, 0) == NULL) list_add_tail_rcu(&entry->list, &rcu_dereference(netlbl_domhsh)->tbl[bkt]); else ret_val = -EEXIST; + spin_unlock(&netlbl_domhsh_lock); } else { INIT_LIST_HEAD(&entry->list); + spin_lock(&netlbl_domhsh_def_lock); if (rcu_dereference(netlbl_domhsh_def) == NULL) rcu_assign_pointer(netlbl_domhsh_def, entry); else ret_val = -EEXIST; + spin_unlock(&netlbl_domhsh_def_lock); } - spin_unlock(&netlbl_domhsh_lock); audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, @@ -320,10 +307,7 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) struct audit_buffer *audit_buf; rcu_read_lock(); - if (domain) - entry = netlbl_domhsh_search(domain); - else - entry = netlbl_domhsh_search_def(domain); + entry = netlbl_domhsh_search(domain, (domain != NULL ?
0 : 1)); if (entry == NULL) goto remove_return; switch (entry->type) { @@ -332,16 +316,23 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) entry->domain); break; } - spin_lock(&netlbl_domhsh_lock); - if (entry->valid) { - entry->valid = 0; - if (entry != rcu_dereference(netlbl_domhsh_def)) + if (entry != rcu_dereference(netlbl_domhsh_def)) { + spin_lock(&netlbl_domhsh_lock); + if (entry->valid) { + entry->valid = 0; list_del_rcu(&entry->list); - else + ret_val = 0; + } + spin_unlock(&netlbl_domhsh_lock); + } else { + spin_lock(&netlbl_domhsh_def_lock); + if (entry->valid) { + entry->valid = 0; rcu_assign_pointer(netlbl_domhsh_def, NULL); - ret_val = 0; + ret_val = 0; + } + spin_unlock(&netlbl_domhsh_def_lock); } - spin_unlock(&netlbl_domhsh_lock); audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info); if (audit_buf != NULL) { @@ -386,7 +377,7 @@ int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) */ struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) { - return netlbl_domhsh_search_def(domain); + return netlbl_domhsh_search(domain, 1); } /** diff --git a/trunk/net/netlabel/netlabel_kapi.c b/trunk/net/netlabel/netlabel_kapi.c index c69e3e1f05c3..4f50949722a9 100644 --- a/trunk/net/netlabel/netlabel_kapi.c +++ b/trunk/net/netlabel/netlabel_kapi.c @@ -34,7 +34,6 @@ #include #include #include -#include #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" @@ -263,7 +262,7 @@ int netlbl_enabled(void) /* At some point we probably want to expose this mechanism to the user * as well so that admins can toggle NetLabel regardless of the * configuration */ - return (atomic_read(&netlabel_mgmt_protocount) > 0); + return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0); } /** @@ -312,7 +311,7 @@ int netlbl_sock_setattr(struct sock *sk, * @secattr: the security attributes * * Description: - * Examines the given sock to see if any NetLabel style labeling has been + * Examines the given sock to see any NetLabel style labeling has been * applied to the sock, if so it parses the socket label and returns the * security attributes in @secattr. Returns zero on success, negative values * on failure. 
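The hunks on either side of this point replace NetLabel's atomic_t protocol counter with a spinlock-guarded u32 (netlbl_mgmt_protocount_inc/dec/value) and make netlbl_enabled() report labeling as active only while that count is positive. The following is a minimal user-space sketch of that same counter-gating pattern, not the kernel code: the pthread locking and every name in it (protocount_inc, labeling_enabled, and so on) are illustrative stand-ins.

/* Sketch of a lock-guarded "configured protocol" counter gating a feature. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t protocount_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int protocount;            /* configured labeled protocols */

static void protocount_inc(void)
{
        pthread_mutex_lock(&protocount_lock);
        protocount++;
        pthread_mutex_unlock(&protocount_lock);
}

static void protocount_dec(void)
{
        pthread_mutex_lock(&protocount_lock);
        if (protocount > 0)                /* never underflow the count */
                protocount--;
        pthread_mutex_unlock(&protocount_lock);
}

static int labeling_enabled(void)
{
        /* mirrors "return (netlbl_mgmt_protocount_value() > 0 ? 1 : 0);" */
        return protocount > 0 ? 1 : 0;
}

int main(void)
{
        protocount_inc();                  /* e.g. a labeled protocol configured */
        printf("enabled=%d\n", labeling_enabled());
        protocount_dec();                  /* configuration removed again */
        printf("enabled=%d\n", labeling_enabled());
        return 0;
}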
@@ -320,13 +319,18 @@ int netlbl_sock_setattr(struct sock *sk, */ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { - return cipso_v4_sock_getattr(sk, secattr); + int ret_val; + + ret_val = cipso_v4_sock_getattr(sk, secattr); + if (ret_val == 0) + return 0; + + return netlbl_unlabel_getattr(secattr); } /** * netlbl_skbuff_getattr - Determine the security attributes of a packet * @skb: the packet - * @family: protocol family * @secattr: the security attributes * * Description: @@ -337,14 +341,13 @@ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) * */ int netlbl_skbuff_getattr(const struct sk_buff *skb, - u16 family, struct netlbl_lsm_secattr *secattr) { if (CIPSO_V4_OPTEXIST(skb) && cipso_v4_skbuff_getattr(skb, secattr) == 0) return 0; - return netlbl_unlabel_getattr(skb, family, secattr); + return netlbl_unlabel_getattr(secattr); } /** @@ -428,10 +431,6 @@ static int __init netlbl_init(void) if (ret_val != 0) goto init_failure; - ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE); - if (ret_val != 0) - goto init_failure; - ret_val = netlbl_netlink_init(); if (ret_val != 0) goto init_failure; diff --git a/trunk/net/netlabel/netlabel_mgmt.c b/trunk/net/netlabel/netlabel_mgmt.c index e2258dc3c845..9c41464d58d1 100644 --- a/trunk/net/netlabel/netlabel_mgmt.c +++ b/trunk/net/netlabel/netlabel_mgmt.c @@ -37,14 +37,14 @@ #include #include #include -#include #include "netlabel_domainhash.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" -/* NetLabel configured protocol counter */ -atomic_t netlabel_mgmt_protocount = ATOMIC_INIT(0); +/* NetLabel configured protocol count */ +static DEFINE_SPINLOCK(netlabel_mgmt_protocount_lock); +static u32 netlabel_mgmt_protocount = 0; /* Argument struct for netlbl_domhsh_walk() */ struct netlbl_domhsh_walk_arg { @@ -70,6 +70,63 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 }, }; +/* + * NetLabel Misc Management Functions + */ + +/** + * netlbl_mgmt_protocount_inc - Increment the configured labeled protocol count + * + * Description: + * Increment the number of labeled protocol configurations in the current + * NetLabel configuration. Keep track of this for use in determining if + * NetLabel label enforcement should be active/enabled or not in the LSM. + * + */ +void netlbl_mgmt_protocount_inc(void) +{ + spin_lock(&netlabel_mgmt_protocount_lock); + netlabel_mgmt_protocount++; + spin_unlock(&netlabel_mgmt_protocount_lock); +} + +/** + * netlbl_mgmt_protocount_dec - Decrement the configured labeled protocol count + * + * Description: + * Decrement the number of labeled protocol configurations in the current + * NetLabel configuration. Keep track of this for use in determining if + * NetLabel label enforcement should be active/enabled or not in the LSM. + * + */ +void netlbl_mgmt_protocount_dec(void) +{ + spin_lock(&netlabel_mgmt_protocount_lock); + if (netlabel_mgmt_protocount > 0) + netlabel_mgmt_protocount--; + spin_unlock(&netlabel_mgmt_protocount_lock); +} + +/** + * netlbl_mgmt_protocount_value - Return the number of configured protocols + * + * Description: + * Return the number of labeled protocols in the current NetLabel + * configuration. This value is useful in determining if NetLabel label + * enforcement should be active/enabled or not in the LSM. 
+ * + */ +u32 netlbl_mgmt_protocount_value(void) +{ + u32 val; + + rcu_read_lock(); + val = netlabel_mgmt_protocount; + rcu_read_unlock(); + + return val; +} + /* * NetLabel Command Handlers */ diff --git a/trunk/net/netlabel/netlabel_mgmt.h b/trunk/net/netlabel/netlabel_mgmt.h index a43bff169d6b..ccb2b3923591 100644 --- a/trunk/net/netlabel/netlabel_mgmt.h +++ b/trunk/net/netlabel/netlabel_mgmt.h @@ -32,7 +32,6 @@ #define _NETLABEL_MGMT_H #include -#include /* * The following NetLabel payloads are supported by the management interface. @@ -169,7 +168,9 @@ enum { /* NetLabel protocol functions */ int netlbl_mgmt_genl_init(void); -/* NetLabel configured protocol reference counter */ -extern atomic_t netlabel_mgmt_protocount; +/* NetLabel misc management functions */ +void netlbl_mgmt_protocount_inc(void); +void netlbl_mgmt_protocount_dec(void); +u32 netlbl_mgmt_protocount_value(void); #endif diff --git a/trunk/net/netlabel/netlabel_unlabeled.c b/trunk/net/netlabel/netlabel_unlabeled.c index 42e81fd8cc49..348292450deb 100644 --- a/trunk/net/netlabel/netlabel_unlabeled.c +++ b/trunk/net/netlabel/netlabel_unlabeled.c @@ -10,7 +10,7 @@ */ /* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2007 + * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,938 +36,38 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include + #include #include -#include - -#include "netlabel_user.h" -#include "netlabel_domainhash.h" -#include "netlabel_unlabeled.h" -#include "netlabel_mgmt.h" - -/* NOTE: at present we always use init's network namespace since we don't - * presently support different namespaces even though the majority of - * the functions in this file are "namespace safe" */ - -/* The unlabeled connection hash table which we use to map network interfaces - * and addresses of unlabeled packets to a user specified secid value for the - * LSM. The hash table is used to lookup the network interface entry - * (struct netlbl_unlhsh_iface) and then the interface entry is used to - * lookup an IP address match from an ordered list. If a network interface - * match can not be found in the hash table then the default entry - * (netlbl_unlhsh_def) is used. The IP address entry list - * (struct netlbl_unlhsh_addr) is ordered such that the entries with a - * larger netmask come first. 
- */ -struct netlbl_unlhsh_tbl { - struct list_head *tbl; - u32 size; -}; -struct netlbl_unlhsh_addr4 { - __be32 addr; - __be32 mask; - u32 secid; - - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; -struct netlbl_unlhsh_addr6 { - struct in6_addr addr; - struct in6_addr mask; - u32 secid; - - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; -struct netlbl_unlhsh_iface { - int ifindex; - struct list_head addr4_list; - struct list_head addr6_list; - - u32 valid; - struct list_head list; - struct rcu_head rcu; -}; - -/* Argument struct for netlbl_unlhsh_walk() */ -struct netlbl_unlhsh_walk_arg { - struct netlink_callback *nl_cb; - struct sk_buff *skb; - u32 seq; -}; - -/* Unlabeled connection hash table */ -/* updates should be so rare that having one spinlock for the entire - * hash table should be okay */ -static DEFINE_SPINLOCK(netlbl_unlhsh_lock); -static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; -static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; - -/* Accept unlabeled packets flag */ -static u8 netlabel_unlabel_acceptflg = 0; - -/* NetLabel Generic NETLINK unlabeled family */ -static struct genl_family netlbl_unlabel_gnl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = NETLBL_NLTYPE_UNLABELED_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_UNLABEL_A_MAX, -}; -/* NetLabel Netlink attribute policy */ -static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { - [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, - [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, - [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, - [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ - 1 }, - [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY } -}; - -/* - * Audit Helper Functions - */ - -/** - * netlbl_unlabel_audit_addr4 - Audit an IPv4 address - * @audit_buf: audit buffer - * @dev: network interface - * @addr: IP address - * @mask: IP address mask - * - * Description: - * Write the IPv4 address and address mask, if necessary, to @audit_buf. - * - */ -static void netlbl_unlabel_audit_addr4(struct audit_buffer *audit_buf, - const char *dev, - __be32 addr, __be32 mask) -{ - u32 mask_val = ntohl(mask); - - if (dev != NULL) - audit_log_format(audit_buf, " netif=%s", dev); - audit_log_format(audit_buf, " src=" NIPQUAD_FMT, NIPQUAD(addr)); - if (mask_val != 0xffffffff) { - u32 mask_len = 0; - while (mask_val > 0) { - mask_val <<= 1; - mask_len++; - } - audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); - } -} - -/** - * netlbl_unlabel_audit_addr6 - Audit an IPv6 address - * @audit_buf: audit buffer - * @dev: network interface - * @addr: IP address - * @mask: IP address mask - * - * Description: - * Write the IPv6 address and address mask, if necessary, to @audit_buf. 
- * - */ -static void netlbl_unlabel_audit_addr6(struct audit_buffer *audit_buf, - const char *dev, - const struct in6_addr *addr, - const struct in6_addr *mask) -{ - if (dev != NULL) - audit_log_format(audit_buf, " netif=%s", dev); - audit_log_format(audit_buf, " src=" NIP6_FMT, NIP6(*addr)); - if (ntohl(mask->s6_addr32[3]) != 0xffffffff) { - u32 mask_len = 0; - u32 mask_val; - int iter = -1; - while (ntohl(mask->s6_addr32[++iter]) == 0xffffffff) - mask_len += 32; - mask_val = ntohl(mask->s6_addr32[iter]); - while (mask_val > 0) { - mask_val <<= 1; - mask_len++; - } - audit_log_format(audit_buf, " src_prefixlen=%d", mask_len); - } -} - -/* - * Unlabeled Connection Hash Table Functions - */ - -/** - * netlbl_unlhsh_free_addr4 - Frees an IPv4 address entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table address entry can be - * released safely. - * - */ -static void netlbl_unlhsh_free_addr4(struct rcu_head *entry) -{ - struct netlbl_unlhsh_addr4 *ptr; - - ptr = container_of(entry, struct netlbl_unlhsh_addr4, rcu); - kfree(ptr); -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_free_addr6 - Frees an IPv6 address entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table address entry can be - * released safely. - * - */ -static void netlbl_unlhsh_free_addr6(struct rcu_head *entry) -{ - struct netlbl_unlhsh_addr6 *ptr; - - ptr = container_of(entry, struct netlbl_unlhsh_addr6, rcu); - kfree(ptr); -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table - * @entry: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table interface entry can be - * released safely. It is important to note that this function does not free - * the IPv4 and IPv6 address lists contained as part of an interface entry. It - * is up to the rest of the code to make sure an interface entry is only freed - * once it's address lists are empty. - * - */ -static void netlbl_unlhsh_free_iface(struct rcu_head *entry) -{ - struct netlbl_unlhsh_iface *iface; - struct netlbl_unlhsh_addr4 *iter4; - struct netlbl_unlhsh_addr4 *tmp4; - struct netlbl_unlhsh_addr6 *iter6; - struct netlbl_unlhsh_addr6 *tmp6; - - iface = container_of(entry, struct netlbl_unlhsh_iface, rcu); - - /* no need for locks here since we are the only one with access to this - * structure */ - - list_for_each_entry_safe(iter4, tmp4, &iface->addr4_list, list) - if (iter4->valid) { - list_del_rcu(&iter4->list); - kfree(iter4); - } - list_for_each_entry_safe(iter6, tmp6, &iface->addr6_list, list) - if (iter6->valid) { - list_del_rcu(&iter6->list); - kfree(iter6); - } - kfree(iface); -} - -/** - * netlbl_unlhsh_hash - Hashing function for the hash table - * @ifindex: the network interface/device to hash - * - * Description: - * This is the hashing function for the unlabeled hash table, it returns the - * bucket number for the given device/interface. The caller is responsible for - * calling the rcu_read_[un]lock() functions. 
- * - */ -static u32 netlbl_unlhsh_hash(int ifindex) -{ - /* this is taken _almost_ directly from - * security/selinux/netif.c:sel_netif_hasfn() as they do pretty much - * the same thing */ - return ifindex & (rcu_dereference(netlbl_unlhsh)->size - 1); -} - -/** - * netlbl_unlhsh_search_addr4 - Search for a matching IPv4 address entry - * @addr: IPv4 address - * @iface: the network interface entry - * - * Description: - * Searches the IPv4 address list of the network interface specified by @iface. - * If a matching address entry is found it is returned, otherwise NULL is - * returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_addr4 *netlbl_unlhsh_search_addr4( - __be32 addr, - const struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr4 *iter; - - list_for_each_entry_rcu(iter, &iface->addr4_list, list) - if (iter->valid && (addr & iter->mask) == iter->addr) - return iter; - - return NULL; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_search_addr6 - Search for a matching IPv6 address entry - * @addr: IPv6 address - * @iface: the network interface entry - * - * Description: - * Searches the IPv6 address list of the network interface specified by @iface. - * If a matching address entry is found it is returned, otherwise NULL is - * returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_addr6 *netlbl_unlhsh_search_addr6( - const struct in6_addr *addr, - const struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr6 *iter; - - list_for_each_entry_rcu(iter, &iface->addr6_list, list) - if (iter->valid && - ipv6_masked_addr_cmp(&iter->addr, &iter->mask, addr) == 0) - return iter; - - return NULL; -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_search_iface - Search for a matching interface entry - * @ifindex: the network interface - * - * Description: - * Searches the unlabeled connection hash table and returns a pointer to the - * interface entry which matches @ifindex, otherwise NULL is returned. The - * caller is responsible for calling the rcu_read_[un]lock() functions. - * - */ -static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) -{ - u32 bkt; - struct netlbl_unlhsh_iface *iter; - - bkt = netlbl_unlhsh_hash(ifindex); - list_for_each_entry_rcu(iter, - &rcu_dereference(netlbl_unlhsh)->tbl[bkt], - list) - if (iter->valid && iter->ifindex == ifindex) - return iter; - - return NULL; -} - -/** - * netlbl_unlhsh_search_iface_def - Search for a matching interface entry - * @ifindex: the network interface - * - * Description: - * Searches the unlabeled connection hash table and returns a pointer to the - * interface entry which matches @ifindex. If an exact match can not be found - * and there is a valid default entry, the default entry is returned, otherwise - * NULL is returned. The caller is responsible for calling the - * rcu_read_[un]lock() functions. 
- * - */ -static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface_def(int ifindex) -{ - struct netlbl_unlhsh_iface *entry; - - entry = netlbl_unlhsh_search_iface(ifindex); - if (entry != NULL) - return entry; - - entry = rcu_dereference(netlbl_unlhsh_def); - if (entry != NULL && entry->valid) - return entry; - - return NULL; -} - -/** - * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table - * @iface: the associated interface entry - * @addr: IPv4 address in network byte order - * @mask: IPv4 address mask in network byte order - * @secid: LSM secid value for entry - * - * Description: - * Add a new address entry into the unlabeled connection hash table using the - * interface entry specified by @iface. On success zero is returned, otherwise - * a negative value is returned. The caller is responsible for calling the - * rcu_read_[un]lock() functions. - * - */ -static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface, - const struct in_addr *addr, - const struct in_addr *mask, - u32 secid) -{ - struct netlbl_unlhsh_addr4 *entry; - struct netlbl_unlhsh_addr4 *iter; - - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) - return -ENOMEM; - - entry->addr = addr->s_addr & mask->s_addr; - entry->mask = mask->s_addr; - entry->secid = secid; - entry->valid = 1; - INIT_RCU_HEAD(&entry->rcu); - - spin_lock(&netlbl_unlhsh_lock); - iter = netlbl_unlhsh_search_addr4(entry->addr, iface); - if (iter != NULL && - iter->addr == addr->s_addr && iter->mask == mask->s_addr) { - spin_unlock(&netlbl_unlhsh_lock); - kfree(entry); - return -EEXIST; - } - /* in order to speed up address searches through the list (the common - * case) we need to keep the list in order based on the size of the - * address mask such that the entry with the widest mask (smallest - * numerical value) appears first in the list */ - list_for_each_entry_rcu(iter, &iface->addr4_list, list) - if (iter->valid && - ntohl(entry->mask) > ntohl(iter->mask)) { - __list_add_rcu(&entry->list, - iter->list.prev, - &iter->list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; - } - list_add_tail_rcu(&entry->list, &iface->addr4_list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table - * @iface: the associated interface entry - * @addr: IPv6 address in network byte order - * @mask: IPv6 address mask in network byte order - * @secid: LSM secid value for entry - * - * Description: - * Add a new address entry into the unlabeled connection hash table using the - * interface entry specified by @iface. On success zero is returned, otherwise - * a negative value is returned. The caller is responsible for calling the - * rcu_read_[un]lock() functions. 
- * - */ -static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface, - const struct in6_addr *addr, - const struct in6_addr *mask, - u32 secid) -{ - struct netlbl_unlhsh_addr6 *entry; - struct netlbl_unlhsh_addr6 *iter; - - entry = kzalloc(sizeof(*entry), GFP_ATOMIC); - if (entry == NULL) - return -ENOMEM; - - ipv6_addr_copy(&entry->addr, addr); - entry->addr.s6_addr32[0] &= mask->s6_addr32[0]; - entry->addr.s6_addr32[1] &= mask->s6_addr32[1]; - entry->addr.s6_addr32[2] &= mask->s6_addr32[2]; - entry->addr.s6_addr32[3] &= mask->s6_addr32[3]; - ipv6_addr_copy(&entry->mask, mask); - entry->secid = secid; - entry->valid = 1; - INIT_RCU_HEAD(&entry->rcu); - - spin_lock(&netlbl_unlhsh_lock); - iter = netlbl_unlhsh_search_addr6(&entry->addr, iface); - if (iter != NULL && - (ipv6_addr_equal(&iter->addr, addr) && - ipv6_addr_equal(&iter->mask, mask))) { - spin_unlock(&netlbl_unlhsh_lock); - kfree(entry); - return -EEXIST; - } - /* in order to speed up address searches through the list (the common - * case) we need to keep the list in order based on the size of the - * address mask such that the entry with the widest mask (smallest - * numerical value) appears first in the list */ - list_for_each_entry_rcu(iter, &iface->addr6_list, list) - if (iter->valid && - ipv6_addr_cmp(&entry->mask, &iter->mask) > 0) { - __list_add_rcu(&entry->list, - iter->list.prev, - &iter->list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; - } - list_add_tail_rcu(&entry->list, &iface->addr6_list); - spin_unlock(&netlbl_unlhsh_lock); - return 0; -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table - * @ifindex: network interface - * - * Description: - * Add a new, empty, interface entry into the unlabeled connection hash table. - * On success a pointer to the new interface entry is returned, on failure NULL - * is returned. The caller is responsible for calling the rcu_read_[un]lock() - * functions. - * - */ -static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex) -{ - u32 bkt; - struct netlbl_unlhsh_iface *iface; - - iface = kzalloc(sizeof(*iface), GFP_ATOMIC); - if (iface == NULL) - return NULL; - - iface->ifindex = ifindex; - INIT_LIST_HEAD(&iface->addr4_list); - INIT_LIST_HEAD(&iface->addr6_list); - iface->valid = 1; - INIT_RCU_HEAD(&iface->rcu); - - spin_lock(&netlbl_unlhsh_lock); - if (ifindex > 0) { - bkt = netlbl_unlhsh_hash(ifindex); - if (netlbl_unlhsh_search_iface(ifindex) != NULL) - goto add_iface_failure; - list_add_tail_rcu(&iface->list, - &rcu_dereference(netlbl_unlhsh)->tbl[bkt]); - } else { - INIT_LIST_HEAD(&iface->list); - if (rcu_dereference(netlbl_unlhsh_def) != NULL) - goto add_iface_failure; - rcu_assign_pointer(netlbl_unlhsh_def, iface); - } - spin_unlock(&netlbl_unlhsh_lock); - - return iface; - -add_iface_failure: - spin_unlock(&netlbl_unlhsh_lock); - kfree(iface); - return NULL; -} - -/** - * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table - * @net: network namespace - * @dev_name: interface name - * @addr: IP address in network byte order - * @mask: address mask in network byte order - * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) - * @secid: LSM secid value for the entry - * @audit_info: NetLabel audit information - * - * Description: - * Adds a new entry to the unlabeled connection hash table. Returns zero on - * success, negative values on failure. 
- * - */ -static int netlbl_unlhsh_add(struct net *net, - const char *dev_name, - const void *addr, - const void *mask, - u32 addr_len, - u32 secid, - struct netlbl_audit *audit_info) -{ - int ret_val; - int ifindex; - struct net_device *dev; - struct netlbl_unlhsh_iface *iface; - struct in_addr *addr4, *mask4; - struct in6_addr *addr6, *mask6; - struct audit_buffer *audit_buf = NULL; - char *secctx = NULL; - u32 secctx_len; - - if (addr_len != sizeof(struct in_addr) && - addr_len != sizeof(struct in6_addr)) - return -EINVAL; - - rcu_read_lock(); - if (dev_name != NULL) { - dev = dev_get_by_name(net, dev_name); - if (dev == NULL) { - ret_val = -ENODEV; - goto unlhsh_add_return; - } - ifindex = dev->ifindex; - dev_put(dev); - iface = netlbl_unlhsh_search_iface(ifindex); - } else { - ifindex = 0; - iface = rcu_dereference(netlbl_unlhsh_def); - } - if (iface == NULL) { - iface = netlbl_unlhsh_add_iface(ifindex); - if (iface == NULL) { - ret_val = -ENOMEM; - goto unlhsh_add_return; - } - } - audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, - audit_info); - switch (addr_len) { - case sizeof(struct in_addr): - addr4 = (struct in_addr *)addr; - mask4 = (struct in_addr *)mask; - ret_val = netlbl_unlhsh_add_addr4(iface, addr4, mask4, secid); - if (audit_buf != NULL) - netlbl_unlabel_audit_addr4(audit_buf, - dev_name, - addr4->s_addr, - mask4->s_addr); - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case sizeof(struct in6_addr): - addr6 = (struct in6_addr *)addr; - mask6 = (struct in6_addr *)mask; - ret_val = netlbl_unlhsh_add_addr6(iface, addr6, mask6, secid); - if (audit_buf != NULL) - netlbl_unlabel_audit_addr6(audit_buf, - dev_name, - addr6, mask6); - break; -#endif /* IPv6 */ - default: - ret_val = -EINVAL; - } - if (ret_val == 0) - atomic_inc(&netlabel_mgmt_protocount); - -unlhsh_add_return: - rcu_read_unlock(); - if (audit_buf != NULL) { - if (security_secid_to_secctx(secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); - } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - return ret_val; -} - -/** - * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry - * @net: network namespace - * @iface: interface entry - * @addr: IP address - * @mask: IP address mask - * @audit_info: NetLabel audit information - * - * Description: - * Remove an IP address entry from the unlabeled connection hash table. - * Returns zero on success, negative values on failure. The caller is - * responsible for calling the rcu_read_[un]lock() functions. - * - */ -static int netlbl_unlhsh_remove_addr4(struct net *net, - struct netlbl_unlhsh_iface *iface, - const struct in_addr *addr, - const struct in_addr *mask, - struct netlbl_audit *audit_info) -{ - int ret_val = -ENOENT; - struct netlbl_unlhsh_addr4 *entry; - struct audit_buffer *audit_buf = NULL; - struct net_device *dev; - char *secctx = NULL; - u32 secctx_len; - - spin_lock(&netlbl_unlhsh_lock); - entry = netlbl_unlhsh_search_addr4(addr->s_addr, iface); - if (entry != NULL && - entry->addr == addr->s_addr && entry->mask == mask->s_addr) { - entry->valid = 0; - list_del_rcu(&entry->list); - ret_val = 0; - } - spin_unlock(&netlbl_unlhsh_lock); - - audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, - audit_info); - if (audit_buf != NULL) { - dev = dev_get_by_index(net, iface->ifindex); - netlbl_unlabel_audit_addr4(audit_buf, - (dev != NULL ? 
dev->name : NULL), - entry->addr, entry->mask); - if (dev != NULL) - dev_put(dev); - if (security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); - } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - - if (ret_val == 0) - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr4); - return ret_val; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -/** - * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry - * @net: network namespace - * @iface: interface entry - * @addr: IP address - * @mask: IP address mask - * @audit_info: NetLabel audit information - * - * Description: - * Remove an IP address entry from the unlabeled connection hash table. - * Returns zero on success, negative values on failure. The caller is - * responsible for calling the rcu_read_[un]lock() functions. - * - */ -static int netlbl_unlhsh_remove_addr6(struct net *net, - struct netlbl_unlhsh_iface *iface, - const struct in6_addr *addr, - const struct in6_addr *mask, - struct netlbl_audit *audit_info) -{ - int ret_val = -ENOENT; - struct netlbl_unlhsh_addr6 *entry; - struct audit_buffer *audit_buf = NULL; - struct net_device *dev; - char *secctx = NULL; - u32 secctx_len; - - spin_lock(&netlbl_unlhsh_lock); - entry = netlbl_unlhsh_search_addr6(addr, iface); - if (entry != NULL && - (ipv6_addr_equal(&entry->addr, addr) && - ipv6_addr_equal(&entry->mask, mask))) { - entry->valid = 0; - list_del_rcu(&entry->list); - ret_val = 0; - } - spin_unlock(&netlbl_unlhsh_lock); - - audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL, - audit_info); - if (audit_buf != NULL) { - dev = dev_get_by_index(net, iface->ifindex); - netlbl_unlabel_audit_addr6(audit_buf, - (dev != NULL ? dev->name : NULL), - addr, mask); - if (dev != NULL) - dev_put(dev); - if (security_secid_to_secctx(entry->secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); - } - audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); - audit_log_end(audit_buf); - } - - if (ret_val == 0) - call_rcu(&entry->rcu, netlbl_unlhsh_free_addr6); - return ret_val; -} -#endif /* IPv6 */ - -/** - * netlbl_unlhsh_condremove_iface - Remove an interface entry - * @iface: the interface entry - * - * Description: - * Remove an interface entry from the unlabeled connection hash table if it is - * empty. An interface entry is considered to be empty if there are no - * address entries assigned to it. 
- * - */ -static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface) -{ - struct netlbl_unlhsh_addr4 *iter4; - struct netlbl_unlhsh_addr6 *iter6; - - spin_lock(&netlbl_unlhsh_lock); - list_for_each_entry_rcu(iter4, &iface->addr4_list, list) - if (iter4->valid) - goto unlhsh_condremove_failure; - list_for_each_entry_rcu(iter6, &iface->addr6_list, list) - if (iter6->valid) - goto unlhsh_condremove_failure; - iface->valid = 0; - if (iface->ifindex > 0) - list_del_rcu(&iface->list); - else - rcu_assign_pointer(netlbl_unlhsh_def, NULL); - spin_unlock(&netlbl_unlhsh_lock); - - call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); - return; - -unlhsh_condremove_failure: - spin_unlock(&netlbl_unlhsh_lock); - return; -} - -/** - * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table - * @net: network namespace - * @dev_name: interface name - * @addr: IP address in network byte order - * @mask: address mask in network byte order - * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6) - * @audit_info: NetLabel audit information - * - * Description: - * Removes and existing entry from the unlabeled connection hash table. - * Returns zero on success, negative values on failure. - * - */ -static int netlbl_unlhsh_remove(struct net *net, - const char *dev_name, - const void *addr, - const void *mask, - u32 addr_len, - struct netlbl_audit *audit_info) -{ - int ret_val; - struct net_device *dev; - struct netlbl_unlhsh_iface *iface; - - if (addr_len != sizeof(struct in_addr) && - addr_len != sizeof(struct in6_addr)) - return -EINVAL; - - rcu_read_lock(); - if (dev_name != NULL) { - dev = dev_get_by_name(net, dev_name); - if (dev == NULL) { - ret_val = -ENODEV; - goto unlhsh_remove_return; - } - iface = netlbl_unlhsh_search_iface(dev->ifindex); - dev_put(dev); - } else - iface = rcu_dereference(netlbl_unlhsh_def); - if (iface == NULL) { - ret_val = -ENOENT; - goto unlhsh_remove_return; - } - switch (addr_len) { - case sizeof(struct in_addr): - ret_val = netlbl_unlhsh_remove_addr4(net, - iface, addr, mask, - audit_info); - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case sizeof(struct in6_addr): - ret_val = netlbl_unlhsh_remove_addr6(net, - iface, addr, mask, - audit_info); - break; -#endif /* IPv6 */ - default: - ret_val = -EINVAL; - } - if (ret_val == 0) { - netlbl_unlhsh_condremove_iface(iface); - atomic_dec(&netlabel_mgmt_protocount); - } - -unlhsh_remove_return: - rcu_read_unlock(); - return ret_val; -} - -/* - * General Helper Functions - */ - -/** - * netlbl_unlhsh_netdev_handler - Network device notification handler - * @this: notifier block - * @event: the event - * @ptr: the network device (cast to void) - * - * Description: - * Handle network device events, although at present all we care about is a - * network device going away. In the case of a device going away we clear any - * related entries from the unlabeled connection hash table. - * - */ -static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, - unsigned long event, - void *ptr) -{ - struct net_device *dev = ptr; - struct netlbl_unlhsh_iface *iface = NULL; +#include "netlabel_user.h" +#include "netlabel_domainhash.h" +#include "netlabel_unlabeled.h" - if (dev->nd_net != &init_net) - return NOTIFY_DONE; +/* Accept unlabeled packets flag */ +static DEFINE_SPINLOCK(netlabel_unlabel_acceptflg_lock); +static u8 netlabel_unlabel_acceptflg = 0; - /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? 
*/ - if (event == NETDEV_DOWN) { - spin_lock(&netlbl_unlhsh_lock); - iface = netlbl_unlhsh_search_iface(dev->ifindex); - if (iface != NULL && iface->valid) { - iface->valid = 0; - list_del_rcu(&iface->list); - } else - iface = NULL; - spin_unlock(&netlbl_unlhsh_lock); - } +/* NetLabel Generic NETLINK CIPSOv4 family */ +static struct genl_family netlbl_unlabel_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_UNLABELED_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_UNLABEL_A_MAX, +}; - if (iface != NULL) - call_rcu(&iface->rcu, netlbl_unlhsh_free_iface); +/* NetLabel Netlink attribute policy */ +static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { + [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 }, +}; - return NOTIFY_DONE; -} +/* + * Helper Functions + */ /** * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag @@ -984,8 +84,11 @@ static void netlbl_unlabel_acceptflg_set(u8 value, struct audit_buffer *audit_buf; u8 old_val; + spin_lock(&netlabel_unlabel_acceptflg_lock); old_val = netlabel_unlabel_acceptflg; netlabel_unlabel_acceptflg = value; + spin_unlock(&netlabel_unlabel_acceptflg_lock); + audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info); if (audit_buf != NULL) { @@ -995,48 +98,6 @@ static void netlbl_unlabel_acceptflg_set(u8 value, } } -/** - * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information - * @info: the Generic NETLINK info block - * @addr: the IP address - * @mask: the IP address mask - * @len: the address length - * - * Description: - * Examine the Generic NETLINK message and extract the IP address information. - * Returns zero on success, negative values on failure. - * - */ -static int netlbl_unlabel_addrinfo_get(struct genl_info *info, - void **addr, - void **mask, - u32 *len) -{ - u32 addr_len; - - if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { - addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); - if (addr_len != sizeof(struct in_addr) && - addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) - return -EINVAL; - *len = addr_len; - *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); - *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]); - return 0; - } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR]) { - addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); - if (addr_len != sizeof(struct in6_addr) && - addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK])) - return -EINVAL; - *len = addr_len; - *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]); - *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]); - return 0; - } - - return -EINVAL; -} - /* * NetLabel Command Handlers */ @@ -1094,9 +155,11 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) goto list_failure; } + rcu_read_lock(); ret_val = nla_put_u8(ans_skb, NLBL_UNLABEL_A_ACPTFLG, netlabel_unlabel_acceptflg); + rcu_read_unlock(); if (ret_val != 0) goto list_failure; @@ -1112,489 +175,11 @@ static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info) return ret_val; } -/** - * netlbl_unlabel_staticadd - Handle a STATICADD message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICADD message and add a new unlabeled - * connection entry to the hash table. Returns zero on success, negative - * values on failure. 
- * - */ -static int netlbl_unlabel_staticadd(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - char *dev_name; - void *addr; - void *mask; - u32 addr_len; - u32 secid; - struct netlbl_audit audit_info; - - /* Don't allow users to add both IPv4 and IPv6 addresses for a - * single entry. However, allow users to create two entries, one each - * for IPv4 and IPv4, with the same LSM security context which should - * achieve the same result. */ - if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || - !info->attrs[NLBL_UNLABEL_A_IFACE] || - !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); - ret_val = security_secctx_to_secid( - nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), - nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), - &secid); - if (ret_val != 0) - return ret_val; - - return netlbl_unlhsh_add(&init_net, - dev_name, addr, mask, addr_len, secid, - &audit_info); -} - -/** - * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICADDDEF message and add a new default - * unlabeled connection entry. Returns zero on success, negative values on - * failure. - * - */ -static int netlbl_unlabel_staticadddef(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - void *addr; - void *mask; - u32 addr_len; - u32 secid; - struct netlbl_audit audit_info; - - /* Don't allow users to add both IPv4 and IPv6 addresses for a - * single entry. However, allow users to create two entries, one each - * for IPv4 and IPv6, with the same LSM security context which should - * achieve the same result. */ - if (!info->attrs[NLBL_UNLABEL_A_SECCTX] || - !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - ret_val = security_secctx_to_secid( - nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]), - nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]), - &secid); - if (ret_val != 0) - return ret_val; - - return netlbl_unlhsh_add(&init_net, - NULL, addr, mask, addr_len, secid, - &audit_info); -} - -/** - * netlbl_unlabel_staticremove - Handle a STATICREMOVE message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICREMOVE message and remove the specified - * unlabeled connection entry. Returns zero on success, negative values on - * failure. - * - */ -static int netlbl_unlabel_staticremove(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - char *dev_name; - void *addr; - void *mask; - u32 addr_len; - struct netlbl_audit audit_info; - - /* See the note in netlbl_unlabel_staticadd() about not allowing both - * IPv4 and IPv6 in the same entry. 
*/ - if (!info->attrs[NLBL_UNLABEL_A_IFACE] || - !((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]); - - return netlbl_unlhsh_remove(&init_net, - dev_name, addr, mask, addr_len, - &audit_info); -} - -/** - * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message - * @skb: the NETLINK buffer - * @info: the Generic NETLINK info block - * - * Description: - * Process a user generated STATICREMOVEDEF message and remove the default - * unlabeled connection entry. Returns zero on success, negative values on - * failure. - * - */ -static int netlbl_unlabel_staticremovedef(struct sk_buff *skb, - struct genl_info *info) -{ - int ret_val; - void *addr; - void *mask; - u32 addr_len; - struct netlbl_audit audit_info; - - /* See the note in netlbl_unlabel_staticadd() about not allowing both - * IPv4 and IPv6 in the same entry. */ - if (!((!info->attrs[NLBL_UNLABEL_A_IPV4ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV4MASK]) ^ - (!info->attrs[NLBL_UNLABEL_A_IPV6ADDR] || - !info->attrs[NLBL_UNLABEL_A_IPV6MASK]))) - return -EINVAL; - - netlbl_netlink_auditinfo(skb, &audit_info); - - ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len); - if (ret_val != 0) - return ret_val; - - return netlbl_unlhsh_remove(&init_net, - NULL, addr, mask, addr_len, - &audit_info); -} - - -/** - * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF] - * @cmd: command/message - * @iface: the interface entry - * @addr4: the IPv4 address entry - * @addr6: the IPv6 address entry - * @arg: the netlbl_unlhsh_walk_arg structure - * - * Description: - * This function is designed to be used to generate a response for a - * STATICLIST or STATICLISTDEF message. When called either @addr4 or @addr6 - * can be specified, not both, the other unspecified entry should be set to - * NULL by the caller. Returns the size of the message on success, negative - * values on failure. 
- * - */ -static int netlbl_unlabel_staticlist_gen(u32 cmd, - const struct netlbl_unlhsh_iface *iface, - const struct netlbl_unlhsh_addr4 *addr4, - const struct netlbl_unlhsh_addr6 *addr6, - void *arg) -{ - int ret_val = -ENOMEM; - struct netlbl_unlhsh_walk_arg *cb_arg = arg; - struct net_device *dev; - void *data; - u32 secid; - char *secctx; - u32 secctx_len; - - data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).pid, - cb_arg->seq, &netlbl_unlabel_gnl_family, - NLM_F_MULTI, cmd); - if (data == NULL) - goto list_cb_failure; - - if (iface->ifindex > 0) { - dev = dev_get_by_index(&init_net, iface->ifindex); - ret_val = nla_put_string(cb_arg->skb, - NLBL_UNLABEL_A_IFACE, dev->name); - dev_put(dev); - if (ret_val != 0) - goto list_cb_failure; - } - - if (addr4) { - struct in_addr addr_struct; - - addr_struct.s_addr = addr4->addr; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV4ADDR, - sizeof(struct in_addr), - &addr_struct); - if (ret_val != 0) - goto list_cb_failure; - - addr_struct.s_addr = addr4->mask; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV4MASK, - sizeof(struct in_addr), - &addr_struct); - if (ret_val != 0) - goto list_cb_failure; - - secid = addr4->secid; - } else { - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV6ADDR, - sizeof(struct in6_addr), - &addr6->addr); - if (ret_val != 0) - goto list_cb_failure; - - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_IPV6MASK, - sizeof(struct in6_addr), - &addr6->mask); - if (ret_val != 0) - goto list_cb_failure; - - secid = addr6->secid; - } - - ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); - if (ret_val != 0) - goto list_cb_failure; - ret_val = nla_put(cb_arg->skb, - NLBL_UNLABEL_A_SECCTX, - secctx_len, - secctx); - security_release_secctx(secctx, secctx_len); - if (ret_val != 0) - goto list_cb_failure; - - cb_arg->seq++; - return genlmsg_end(cb_arg->skb, data); - -list_cb_failure: - genlmsg_cancel(cb_arg->skb, data); - return ret_val; -} - -/** - * netlbl_unlabel_staticlist - Handle a STATICLIST message - * @skb: the NETLINK buffer - * @cb: the NETLINK callback - * - * Description: - * Process a user generated STATICLIST message and dump the unlabeled - * connection hash table in a form suitable for use in a kernel generated - * STATICLIST message. Returns the length of @skb. 
- * - */ -static int netlbl_unlabel_staticlist(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct netlbl_unlhsh_walk_arg cb_arg; - u32 skip_bkt = cb->args[0]; - u32 skip_chain = cb->args[1]; - u32 skip_addr4 = cb->args[2]; - u32 skip_addr6 = cb->args[3]; - u32 iter_bkt; - u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; - struct netlbl_unlhsh_iface *iface; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; - - cb_arg.nl_cb = cb; - cb_arg.skb = skb; - cb_arg.seq = cb->nlh->nlmsg_seq; - - rcu_read_lock(); - for (iter_bkt = skip_bkt; - iter_bkt < rcu_dereference(netlbl_unlhsh)->size; - iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { - list_for_each_entry_rcu(iface, - &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt], - list) { - if (!iface->valid || - iter_chain++ < skip_chain) - continue; - list_for_each_entry_rcu(addr4, - &iface->addr4_list, - list) { - if (!addr4->valid || iter_addr4++ < skip_addr4) - continue; - if (netlbl_unlabel_staticlist_gen( - NLBL_UNLABEL_C_STATICLIST, - iface, - addr4, - NULL, - &cb_arg) < 0) { - iter_addr4--; - iter_chain--; - goto unlabel_staticlist_return; - } - } - list_for_each_entry_rcu(addr6, - &iface->addr6_list, - list) { - if (!addr6->valid || iter_addr6++ < skip_addr6) - continue; - if (netlbl_unlabel_staticlist_gen( - NLBL_UNLABEL_C_STATICLIST, - iface, - NULL, - addr6, - &cb_arg) < 0) { - iter_addr6--; - iter_chain--; - goto unlabel_staticlist_return; - } - } - } - } - -unlabel_staticlist_return: - rcu_read_unlock(); - cb->args[0] = skip_bkt; - cb->args[1] = skip_chain; - cb->args[2] = skip_addr4; - cb->args[3] = skip_addr6; - return skb->len; -} - -/** - * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message - * @skb: the NETLINK buffer - * @cb: the NETLINK callback - * - * Description: - * Process a user generated STATICLISTDEF message and dump the default - * unlabeled connection entry in a form suitable for use in a kernel generated - * STATICLISTDEF message. Returns the length of @skb. 
- * - */ -static int netlbl_unlabel_staticlistdef(struct sk_buff *skb, - struct netlink_callback *cb) -{ - struct netlbl_unlhsh_walk_arg cb_arg; - struct netlbl_unlhsh_iface *iface; - u32 skip_addr4 = cb->args[0]; - u32 skip_addr6 = cb->args[1]; - u32 iter_addr4 = 0, iter_addr6 = 0; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; - - cb_arg.nl_cb = cb; - cb_arg.skb = skb; - cb_arg.seq = cb->nlh->nlmsg_seq; - - rcu_read_lock(); - iface = rcu_dereference(netlbl_unlhsh_def); - if (iface == NULL || !iface->valid) - goto unlabel_staticlistdef_return; - - list_for_each_entry_rcu(addr4, &iface->addr4_list, list) { - if (!addr4->valid || iter_addr4++ < skip_addr4) - continue; - if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, - iface, - addr4, - NULL, - &cb_arg) < 0) { - iter_addr4--; - goto unlabel_staticlistdef_return; - } - } - list_for_each_entry_rcu(addr6, &iface->addr6_list, list) { - if (addr6->valid || iter_addr6++ < skip_addr6) - continue; - if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF, - iface, - NULL, - addr6, - &cb_arg) < 0) { - iter_addr6--; - goto unlabel_staticlistdef_return; - } - } - -unlabel_staticlistdef_return: - rcu_read_unlock(); - cb->args[0] = skip_addr4; - cb->args[1] = skip_addr6; - return skb->len; -} /* * NetLabel Generic NETLINK Command Definitions */ -static struct genl_ops netlbl_unlabel_genl_c_staticadd = { - .cmd = NLBL_UNLABEL_C_STATICADD, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticadd, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticremove = { - .cmd = NLBL_UNLABEL_C_STATICREMOVE, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticremove, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticlist = { - .cmd = NLBL_UNLABEL_C_STATICLIST, - .flags = 0, - .policy = netlbl_unlabel_genl_policy, - .doit = NULL, - .dumpit = netlbl_unlabel_staticlist, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticadddef = { - .cmd = NLBL_UNLABEL_C_STATICADDDEF, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticadddef, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticremovedef = { - .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF, - .flags = GENL_ADMIN_PERM, - .policy = netlbl_unlabel_genl_policy, - .doit = netlbl_unlabel_staticremovedef, - .dumpit = NULL, -}; - -static struct genl_ops netlbl_unlabel_genl_c_staticlistdef = { - .cmd = NLBL_UNLABEL_C_STATICLISTDEF, - .flags = 0, - .policy = netlbl_unlabel_genl_policy, - .doit = NULL, - .dumpit = netlbl_unlabel_staticlistdef, -}; - static struct genl_ops netlbl_unlabel_genl_c_accept = { .cmd = NLBL_UNLABEL_C_ACCEPT, .flags = GENL_ADMIN_PERM, @@ -1611,6 +196,7 @@ static struct genl_ops netlbl_unlabel_genl_c_list = { .dumpit = NULL, }; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -1631,36 +217,6 @@ int netlbl_unlabel_genl_init(void) if (ret_val != 0) return ret_val; - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticadd); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticremove); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticlist); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - 
&netlbl_unlabel_genl_c_staticadddef); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticremovedef); - if (ret_val != 0) - return ret_val; - - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, - &netlbl_unlabel_genl_c_staticlistdef); - if (ret_val != 0) - return ret_val; - ret_val = genl_register_ops(&netlbl_unlabel_gnl_family, &netlbl_unlabel_genl_c_accept); if (ret_val != 0) @@ -1678,58 +234,8 @@ int netlbl_unlabel_genl_init(void) * NetLabel KAPI Hooks */ -static struct notifier_block netlbl_unlhsh_netdev_notifier = { - .notifier_call = netlbl_unlhsh_netdev_handler, -}; - -/** - * netlbl_unlabel_init - Initialize the unlabeled connection hash table - * @size: the number of bits to use for the hash buckets - * - * Description: - * Initializes the unlabeled connection hash table and registers a network - * device notification handler. This function should only be called by the - * NetLabel subsystem itself during initialization. Returns zero on success, - * non-zero values on error. - * - */ -int netlbl_unlabel_init(u32 size) -{ - u32 iter; - struct netlbl_unlhsh_tbl *hsh_tbl; - - if (size == 0) - return -EINVAL; - - hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL); - if (hsh_tbl == NULL) - return -ENOMEM; - hsh_tbl->size = 1 << size; - hsh_tbl->tbl = kcalloc(hsh_tbl->size, - sizeof(struct list_head), - GFP_KERNEL); - if (hsh_tbl->tbl == NULL) { - kfree(hsh_tbl); - return -ENOMEM; - } - for (iter = 0; iter < hsh_tbl->size; iter++) - INIT_LIST_HEAD(&hsh_tbl->tbl[iter]); - - rcu_read_lock(); - spin_lock(&netlbl_unlhsh_lock); - rcu_assign_pointer(netlbl_unlhsh, hsh_tbl); - spin_unlock(&netlbl_unlhsh_lock); - rcu_read_unlock(); - - register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier); - - return 0; -} - /** * netlbl_unlabel_getattr - Get the security attributes for an unlabled packet - * @skb: the packet - * @family: protocol family * @secattr: the security attributes * * Description: @@ -1737,52 +243,19 @@ int netlbl_unlabel_init(u32 size) * them in @secattr. Returns zero on success and negative values on failure. 
* */ -int netlbl_unlabel_getattr(const struct sk_buff *skb, - u16 family, - struct netlbl_lsm_secattr *secattr) +int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr) { - struct iphdr *hdr4; - struct ipv6hdr *hdr6; - struct netlbl_unlhsh_addr4 *addr4; - struct netlbl_unlhsh_addr6 *addr6; - struct netlbl_unlhsh_iface *iface; + int ret_val; rcu_read_lock(); - iface = netlbl_unlhsh_search_iface_def(skb->iif); - if (iface == NULL) - goto unlabel_getattr_nolabel; - switch (family) { - case PF_INET: - hdr4 = ip_hdr(skb); - addr4 = netlbl_unlhsh_search_addr4(hdr4->saddr, iface); - if (addr4 == NULL) - goto unlabel_getattr_nolabel; - secattr->attr.secid = addr4->secid; - break; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - case PF_INET6: - hdr6 = ipv6_hdr(skb); - addr6 = netlbl_unlhsh_search_addr6(&hdr6->saddr, iface); - if (addr6 == NULL) - goto unlabel_getattr_nolabel; - secattr->attr.secid = addr6->secid; - break; -#endif /* IPv6 */ - default: - goto unlabel_getattr_nolabel; - } + if (netlabel_unlabel_acceptflg == 1) { + netlbl_secattr_init(secattr); + ret_val = 0; + } else + ret_val = -ENOMSG; rcu_read_unlock(); - secattr->flags |= NETLBL_SECATTR_SECID; - secattr->type = NETLBL_NLTYPE_UNLABELED; - return 0; - -unlabel_getattr_nolabel: - rcu_read_unlock(); - if (netlabel_unlabel_acceptflg == 0) - return -ENOMSG; - secattr->type = NETLBL_NLTYPE_UNLABELED; - return 0; + return ret_val; } /** diff --git a/trunk/net/netlabel/netlabel_unlabeled.h b/trunk/net/netlabel/netlabel_unlabeled.h index 06b1301ac072..c2917fbb42cf 100644 --- a/trunk/net/netlabel/netlabel_unlabeled.h +++ b/trunk/net/netlabel/netlabel_unlabeled.h @@ -36,116 +36,6 @@ /* * The following NetLabel payloads are supported by the Unlabeled subsystem. * - * o STATICADD - * This message is sent from an application to add a new static label for - * incoming unlabeled connections. - * - * Required attributes: - * - * NLBL_UNLABEL_A_IFACE - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICREMOVE - * This message is sent from an application to remove an existing static - * label for incoming unlabeled connections. - * - * Required attributes: - * - * NLBL_UNLABEL_A_IFACE - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICLIST - * This message can be sent either from an application or by the kernel in - * response to an application generated STATICLIST message. When sent by an - * application there is no payload and the NLM_F_DUMP flag should be set. - * The kernel should response with a series of the following messages. - * - * Required attributes: - * - * NLBL_UNLABEL_A_IFACE - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICADDDEF - * This message is sent from an application to set the default static - * label for incoming unlabeled connections. 
- * - * Required attribute: - * - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICREMOVEDEF - * This message is sent from an application to remove the existing default - * static label for incoming unlabeled connections. - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * - * o STATICLISTDEF - * This message can be sent either from an application or by the kernel in - * response to an application generated STATICLISTDEF message. When sent by - * an application there is no payload and the NLM_F_DUMP flag should be set. - * The kernel should response with the following message. - * - * Required attribute: - * - * NLBL_UNLABEL_A_SECCTX - * - * If IPv4 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV4ADDR - * NLBL_UNLABEL_A_IPV4MASK - * - * If IPv6 is specified the following attributes are required: - * - * NLBL_UNLABEL_A_IPV6ADDR - * NLBL_UNLABEL_A_IPV6MASK - * * o ACCEPT * This message is sent from an application to specify if the kernel should * allow unlabled packets to pass if they do not match any of the static @@ -172,12 +62,6 @@ enum { NLBL_UNLABEL_C_UNSPEC, NLBL_UNLABEL_C_ACCEPT, NLBL_UNLABEL_C_LIST, - NLBL_UNLABEL_C_STATICADD, - NLBL_UNLABEL_C_STATICREMOVE, - NLBL_UNLABEL_C_STATICLIST, - NLBL_UNLABEL_C_STATICADDDEF, - NLBL_UNLABEL_C_STATICREMOVEDEF, - NLBL_UNLABEL_C_STATICLISTDEF, __NLBL_UNLABEL_C_MAX, }; #define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1) @@ -189,24 +73,6 @@ enum { /* (NLA_U8) * if true then unlabeled packets are allowed to pass, else unlabeled * packets are rejected */ - NLBL_UNLABEL_A_IPV6ADDR, - /* (NLA_BINARY, struct in6_addr) - * an IPv6 address */ - NLBL_UNLABEL_A_IPV6MASK, - /* (NLA_BINARY, struct in6_addr) - * an IPv6 address mask */ - NLBL_UNLABEL_A_IPV4ADDR, - /* (NLA_BINARY, struct in_addr) - * an IPv4 address */ - NLBL_UNLABEL_A_IPV4MASK, - /* (NLA_BINARY, struct in_addr) - * and IPv4 address mask */ - NLBL_UNLABEL_A_IFACE, - /* (NLA_NULL_STRING) - * network interface */ - NLBL_UNLABEL_A_SECCTX, - /* (NLA_BINARY) - * a LSM specific security context */ __NLBL_UNLABEL_A_MAX, }; #define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1) @@ -214,17 +80,8 @@ enum { /* NetLabel protocol functions */ int netlbl_unlabel_genl_init(void); -/* Unlabeled connection hash table size */ -/* XXX - currently this number is an uneducated guess */ -#define NETLBL_UNLHSH_BITSIZE 7 - -/* General Unlabeled init function */ -int netlbl_unlabel_init(u32 size); - /* Process Unlabeled incoming network packets */ -int netlbl_unlabel_getattr(const struct sk_buff *skb, - u16 family, - struct netlbl_lsm_secattr *secattr); +int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr); /* Set the default configuration to allow Unlabeled packets */ int netlbl_unlabel_defconf(void); diff --git a/trunk/security/Kconfig b/trunk/security/Kconfig index 389e151e3b68..8086e61058e3 100644 --- a/trunk/security/Kconfig +++ b/trunk/security/Kconfig @@ -76,7 +76,6 @@ config SECURITY_NETWORK_XFRM config SECURITY_CAPABILITIES bool "Default Linux Capabilities" depends on SECURITY - default y help This enables the "default" 
Linux capabilities functionality. If you are unsure how to answer this question, answer Y. diff --git a/trunk/security/selinux/Kconfig b/trunk/security/selinux/Kconfig index 2b517d618672..b32a459c0683 100644 --- a/trunk/security/selinux/Kconfig +++ b/trunk/security/selinux/Kconfig @@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE int "NSA SELinux maximum supported policy format version value" depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX - range 15 22 + range 15 21 default 19 help This option sets the value for the maximum policy format version diff --git a/trunk/security/selinux/Makefile b/trunk/security/selinux/Makefile index 00afd85f1edb..dc3502e30b19 100644 --- a/trunk/security/selinux/Makefile +++ b/trunk/security/selinux/Makefile @@ -4,14 +4,7 @@ obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ -selinux-y := avc.o \ - hooks.o \ - selinuxfs.o \ - netlink.o \ - nlmsgtab.o \ - netif.o \ - netnode.o \ - exports.o +selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o diff --git a/trunk/security/selinux/avc.c b/trunk/security/selinux/avc.c index e8529e2f51e5..81b3dff3cbf0 100644 --- a/trunk/security/selinux/avc.c +++ b/trunk/security/selinux/avc.c @@ -661,18 +661,9 @@ void avc_audit(u32 ssid, u32 tsid, "daddr", "dest"); break; } - if (a->u.net.netif > 0) { - struct net_device *dev; - - /* NOTE: we always use init's namespace */ - dev = dev_get_by_index(&init_net, - a->u.net.netif); - if (dev) { - audit_log_format(ab, " netif=%s", - dev->name); - dev_put(dev); - } - } + if (a->u.net.netif) + audit_log_format(ab, " netif=%s", + a->u.net.netif); break; } } diff --git a/trunk/security/selinux/exports.c b/trunk/security/selinux/exports.c index 87d2bb3ea355..b6f96943be1f 100644 --- a/trunk/security/selinux/exports.c +++ b/trunk/security/selinux/exports.c @@ -17,14 +17,10 @@ #include #include #include -#include #include "security.h" #include "objsec.h" -/* SECMARK reference count */ -extern atomic_t selinux_secmark_refcount; - int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen) { if (selinux_enabled) @@ -78,7 +74,7 @@ int selinux_string_to_sid(char *str, u32 *sid) } EXPORT_SYMBOL_GPL(selinux_string_to_sid); -int selinux_secmark_relabel_packet_permission(u32 sid) +int selinux_relabel_packet_permission(u32 sid) { if (selinux_enabled) { struct task_security_struct *tsec = current->security; @@ -88,16 +84,4 @@ int selinux_secmark_relabel_packet_permission(u32 sid) } return 0; } -EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission); - -void selinux_secmark_refcount_inc(void) -{ - atomic_inc(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc); - -void selinux_secmark_refcount_dec(void) -{ - atomic_dec(&selinux_secmark_refcount); -} -EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec); +EXPORT_SYMBOL_GPL(selinux_relabel_packet_permission); diff --git a/trunk/security/selinux/hooks.c b/trunk/security/selinux/hooks.c index be6de0b8734f..64d414efb404 100644 --- a/trunk/security/selinux/hooks.c +++ b/trunk/security/selinux/hooks.c @@ -12,8 +12,8 @@ * Copyright (C) 2003 Red Hat, Inc., James Morris * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * - * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. - * Paul Moore + * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. + * Paul Moore, * Copyright (C) 2007 Hitachi Software Engineering Co., Ltd. 
* Yuichi Nakamura * @@ -50,11 +50,8 @@ #include #include /* for local_port_range[] */ #include /* struct or_callable used in sock_rcv_skb */ -#include -#include #include #include -#include #include #include #include /* for network interface checks */ @@ -79,7 +76,6 @@ #include "avc.h" #include "objsec.h" #include "netif.h" -#include "netnode.h" #include "xfrm.h" #include "netlabel.h" @@ -93,9 +89,6 @@ extern int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm); extern int selinux_compat_net; extern struct security_operations *security_ops; -/* SECMARK reference count */ -atomic_t selinux_secmark_refcount = ATOMIC_INIT(0); - #ifdef CONFIG_SECURITY_SELINUX_DEVELOP int selinux_enforcing = 0; @@ -162,21 +155,6 @@ static int selinux_getsecurity(u32 sid, void *buffer, size_t size) return len; } -/** - * selinux_secmark_enabled - Check to see if SECMARK is currently enabled - * - * Description: - * This function checks the SECMARK reference counter to see if any SECMARK - * targets are currently configured, if the reference counter is greater than - * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is - * enabled, false (0) if SECMARK is disabled. - * - */ -static int selinux_secmark_enabled(void) -{ - return (atomic_read(&selinux_secmark_refcount) > 0); -} - /* Allocate and free functions for each kind of security blob. */ static int task_alloc_security(struct task_struct *task) @@ -583,8 +561,8 @@ static int bad_option(struct superblock_security_struct *sbsec, char flag, * Allow filesystems with binary mount data to explicitly set mount point * labeling information. */ -static int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, - int *flags, int num_opts) +int selinux_set_mnt_opts(struct super_block *sb, char **mount_options, + int *flags, int num_opts) { int rc = 0, i; struct task_security_struct *tsec = current->security; @@ -3417,7 +3395,7 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, - char **addrp, int src, u8 *proto) + char **addrp, int *len, int src, u8 *proto) { int ret = 0; @@ -3426,6 +3404,7 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, ret = selinux_parse_skb_ipv4(skb, ad, proto); if (ret || !addrp) break; + *len = 4; *addrp = (char *)(src ? &ad->u.net.v4info.saddr : &ad->u.net.v4info.daddr); break; @@ -3435,6 +3414,7 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, ret = selinux_parse_skb_ipv6(skb, ad, proto); if (ret || !addrp) break; + *len = 16; *addrp = (char *)(src ? &ad->u.net.v6info.saddr : &ad->u.net.v6info.daddr); break; @@ -3443,48 +3423,36 @@ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, break; } - if (unlikely(ret)) - printk(KERN_WARNING - "SELinux: failure in selinux_parse_skb()," - " unable to parse packet\n"); - return ret; } /** - * selinux_skb_peerlbl_sid - Determine the peer label of a packet + * selinux_skb_extlbl_sid - Determine the external label of a packet * @skb: the packet - * @family: protocol family - * @sid: the packet's peer label SID + * @sid: the packet's SID * * Description: - * Check the various different forms of network peer labeling and determine - * the peer label/SID for the packet; most of the magic actually occurs in - * the security server function security_net_peersid_cmp(). 
The function - * returns zero if the value in @sid is valid (although it may be SECSID_NULL) - * or -EACCES if @sid is invalid due to inconsistencies with the different - * peer labels. + * Check the various different forms of external packet labeling and determine + * the external SID for the packet. If only one form of external labeling is + * present then it is used, if both labeled IPsec and NetLabel labels are + * present then the SELinux type information is taken from the labeled IPsec + * SA and the MLS sensitivity label information is taken from the NetLabel + * security attributes. This bit of "magic" is done in the call to + * selinux_netlbl_skbuff_getsid(). * */ -static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) +static void selinux_skb_extlbl_sid(struct sk_buff *skb, u32 *sid) { - int err; u32 xfrm_sid; u32 nlbl_sid; - u32 nlbl_type; selinux_skb_xfrm_sid(skb, &xfrm_sid); - selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); - - err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid); - if (unlikely(err)) { - printk(KERN_WARNING - "SELinux: failure in selinux_skb_peerlbl_sid()," - " unable to determine packet's peer label\n"); - return -EACCES; - } - - return 0; + if (selinux_netlbl_skbuff_getsid(skb, + (xfrm_sid == SECSID_NULL ? + SECINITSID_NETMSG : xfrm_sid), + &nlbl_sid) != 0) + nlbl_sid = SECSID_NULL; + *sid = (nlbl_sid == SECSID_NULL ? xfrm_sid : nlbl_sid); } /* socket security operations */ @@ -3550,7 +3518,6 @@ static int selinux_socket_post_create(struct socket *sock, int family, if (sock->sk) { sksec = sock->sk->sk_security; sksec->sid = isec->sid; - sksec->sclass = isec->sclass; err = selinux_netlbl_socket_post_create(sock); } @@ -3643,7 +3610,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in break; } - err = sel_netnode_sid(addrp, family, &sid); + err = security_node_sid(family, addrp, addrlen, &sid); if (err) goto out; @@ -3854,182 +3821,131 @@ static int selinux_socket_unix_may_send(struct socket *sock, return 0; } -static int selinux_inet_sys_rcv_skb(int ifindex, char *addrp, u16 family, - u32 peer_sid, - struct avc_audit_data *ad) +static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, + struct avc_audit_data *ad, u16 family, char *addrp, int len) { - int err; - u32 if_sid; - u32 node_sid; + int err = 0; + u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0; + struct socket *sock; + u16 sock_class = 0; + u32 sock_sid = 0; + + read_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + if (sock) { + struct inode *inode; + inode = SOCK_INODE(sock); + if (inode) { + struct inode_security_struct *isec; + isec = inode->i_security; + sock_sid = isec->sid; + sock_class = isec->sclass; + } + } + read_unlock_bh(&sk->sk_callback_lock); + if (!sock_sid) + goto out; - err = sel_netif_sid(ifindex, &if_sid); - if (err) - return err; - err = avc_has_perm(peer_sid, if_sid, - SECCLASS_NETIF, NETIF__INGRESS, ad); - if (err) - return err; + if (!skb->dev) + goto out; - err = sel_netnode_sid(addrp, family, &node_sid); + err = sel_netif_sids(skb->dev, &if_sid, NULL); if (err) - return err; - return avc_has_perm(peer_sid, node_sid, - SECCLASS_NODE, NODE__RECVFROM, ad); -} - -static int selinux_sock_rcv_skb_iptables_compat(struct sock *sk, - struct sk_buff *skb, - struct avc_audit_data *ad, - u16 family, - char *addrp) -{ - int err; - struct sk_security_struct *sksec = sk->sk_security; - u16 sk_class; - u32 netif_perm, node_perm, recv_perm; - u32 port_sid, 
node_sid, if_sid, sk_sid; - - sk_sid = sksec->sid; - sk_class = sksec->sclass; + goto out; - switch (sk_class) { + switch (sock_class) { case SECCLASS_UDP_SOCKET: netif_perm = NETIF__UDP_RECV; node_perm = NODE__UDP_RECV; recv_perm = UDP_SOCKET__RECV_MSG; break; + case SECCLASS_TCP_SOCKET: netif_perm = NETIF__TCP_RECV; node_perm = NODE__TCP_RECV; recv_perm = TCP_SOCKET__RECV_MSG; break; + case SECCLASS_DCCP_SOCKET: netif_perm = NETIF__DCCP_RECV; node_perm = NODE__DCCP_RECV; recv_perm = DCCP_SOCKET__RECV_MSG; break; + default: netif_perm = NETIF__RAWIP_RECV; node_perm = NODE__RAWIP_RECV; - recv_perm = 0; break; } - err = sel_netif_sid(skb->iif, &if_sid); - if (err) - return err; - err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); + err = avc_has_perm(sock_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); if (err) - return err; + goto out; - err = sel_netnode_sid(addrp, family, &node_sid); + err = security_node_sid(family, addrp, len, &node_sid); if (err) - return err; - err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad); + goto out; + + err = avc_has_perm(sock_sid, node_sid, SECCLASS_NODE, node_perm, ad); if (err) - return err; - - if (!recv_perm) - return 0; - err = security_port_sid(sk->sk_family, sk->sk_type, - sk->sk_protocol, ntohs(ad->u.net.sport), - &port_sid); - if (unlikely(err)) { - printk(KERN_WARNING - "SELinux: failure in" - " selinux_sock_rcv_skb_iptables_compat()," - " network port label not found\n"); - return err; - } - return avc_has_perm(sk_sid, port_sid, sk_class, recv_perm, ad); -} - -static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, - struct avc_audit_data *ad, - u16 family, char *addrp) -{ - int err; - struct sk_security_struct *sksec = sk->sk_security; - u32 peer_sid; - u32 sk_sid = sksec->sid; + goto out; - if (selinux_compat_net) - err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad, - family, addrp); - else - err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, - PACKET__RECV, ad); - if (err) - return err; + if (recv_perm) { + u32 port_sid; - if (selinux_policycap_netpeer) { - err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); - if (err) - return err; - err = avc_has_perm(sk_sid, peer_sid, - SECCLASS_PEER, PEER__RECV, ad); - } else { - err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad); + err = security_port_sid(sk->sk_family, sk->sk_type, + sk->sk_protocol, ntohs(ad->u.net.sport), + &port_sid); if (err) - return err; - err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad); + goto out; + + err = avc_has_perm(sock_sid, port_sid, + sock_class, recv_perm, ad); } +out: return err; } static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err; - struct sk_security_struct *sksec = sk->sk_security; - u16 family = sk->sk_family; - u32 sk_sid = sksec->sid; - struct avc_audit_data ad; + u16 family; char *addrp; + int len, err = 0; + struct avc_audit_data ad; + struct sk_security_struct *sksec = sk->sk_security; + family = sk->sk_family; if (family != PF_INET && family != PF_INET6) - return 0; + goto out; /* Handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; AVC_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.netif = skb->iif; + ad.u.net.netif = skb->dev ? 
skb->dev->name : "[unknown]"; ad.u.net.family = family; - err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); - if (err) - return err; - - /* If any sort of compatibility mode is enabled then handoff processing - * to the selinux_sock_rcv_skb_compat() function to deal with the - * special handling. We do this in an attempt to keep this function - * as fast and as clean as possible. */ - if (selinux_compat_net || !selinux_policycap_netpeer) - return selinux_sock_rcv_skb_compat(sk, skb, &ad, - family, addrp); - - if (netlbl_enabled() || selinux_xfrm_enabled()) { - u32 peer_sid; - err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); - if (err) - return err; - err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family, - peer_sid, &ad); - if (err) - return err; - err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, - PEER__RECV, &ad); - } + err = selinux_parse_skb(skb, &ad, &addrp, &len, 1, NULL); + if (err) + goto out; - if (selinux_secmark_enabled()) { - err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, + if (selinux_compat_net) + err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family, + addrp, len); + else + err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); - if (err) - return err; - } + if (err) + goto out; + err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad); + if (err) + goto out; + + err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); +out: return err; } @@ -4080,25 +3996,18 @@ static int selinux_socket_getpeersec_stream(struct socket *sock, char __user *op static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { u32 peer_secid = SECSID_NULL; - u16 family; - - if (sock) - family = sock->sk->sk_family; - else if (skb && skb->sk) - family = skb->sk->sk_family; - else - goto out; + int err = 0; - if (sock && family == PF_UNIX) + if (sock && sock->sk->sk_family == PF_UNIX) selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid); else if (skb) - selinux_skb_peerlbl_sid(skb, family, &peer_secid); + selinux_skb_extlbl_sid(skb, &peer_secid); -out: - *secid = peer_secid; if (peer_secid == SECSID_NULL) - return -EINVAL; - return 0; + err = -EINVAL; + *secid = peer_secid; + + return err; } static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority) @@ -4118,7 +4027,6 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk) newssec->sid = ssec->sid; newssec->peer_sid = ssec->peer_sid; - newssec->sclass = ssec->sclass; selinux_netlbl_sk_security_clone(ssec, newssec); } @@ -4142,7 +4050,6 @@ static void selinux_sock_graft(struct sock* sk, struct socket *parent) if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 || sk->sk_family == PF_UNIX) isec->sid = sksec->sid; - sksec->sclass = isec->sclass; selinux_netlbl_sock_graft(sk, parent); } @@ -4155,9 +4062,7 @@ static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, u32 newsid; u32 peersid; - err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid); - if (err) - return err; + selinux_skb_extlbl_sid(skb, &peersid); if (peersid == SECSID_NULL) { req->secid = sksec->sid; req->peer_secid = SECSID_NULL; @@ -4195,7 +4100,7 @@ static void selinux_inet_conn_established(struct sock *sk, { struct sk_security_struct *sksec = sk->sk_security; - selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid); + selinux_skb_extlbl_sid(skb, &sksec->peer_sid); } static void selinux_req_classify_flow(const struct request_sock *req, @@ -4242,260 +4147,149 @@ static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff 
*skb) #ifdef CONFIG_NETFILTER -static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex, - u16 family) -{ - char *addrp; - u32 peer_sid; - struct avc_audit_data ad; - u8 secmark_active; - u8 peerlbl_active; - - if (!selinux_policycap_netpeer) - return NF_ACCEPT; - - secmark_active = selinux_secmark_enabled(); - peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); - if (!secmark_active && !peerlbl_active) - return NF_ACCEPT; - - AVC_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.netif = ifindex; - ad.u.net.family = family; - if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) - return NF_DROP; - - if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) - return NF_DROP; - - if (peerlbl_active) - if (selinux_inet_sys_rcv_skb(ifindex, addrp, family, - peer_sid, &ad) != 0) - return NF_DROP; - - if (secmark_active) - if (avc_has_perm(peer_sid, skb->secmark, - SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) - return NF_DROP; - - return NF_ACCEPT; -} - -static unsigned int selinux_ipv4_forward(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev, + struct avc_audit_data *ad, + u16 family, char *addrp, int len) { - return selinux_ip_forward(skb, in->ifindex, PF_INET); -} + int err = 0; + u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0; + struct socket *sock; + struct inode *inode; + struct inode_security_struct *isec; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static unsigned int selinux_ipv6_forward(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return selinux_ip_forward(skb, in->ifindex, PF_INET6); -} -#endif /* IPV6 */ + sock = sk->sk_socket; + if (!sock) + goto out; -static int selinux_ip_postroute_iptables_compat(struct sock *sk, - int ifindex, - struct avc_audit_data *ad, - u16 family, char *addrp) -{ - int err; - struct sk_security_struct *sksec = sk->sk_security; - u16 sk_class; - u32 netif_perm, node_perm, send_perm; - u32 port_sid, node_sid, if_sid, sk_sid; + inode = SOCK_INODE(sock); + if (!inode) + goto out; - sk_sid = sksec->sid; - sk_class = sksec->sclass; + isec = inode->i_security; + + err = sel_netif_sids(dev, &if_sid, NULL); + if (err) + goto out; - switch (sk_class) { + switch (isec->sclass) { case SECCLASS_UDP_SOCKET: netif_perm = NETIF__UDP_SEND; node_perm = NODE__UDP_SEND; send_perm = UDP_SOCKET__SEND_MSG; break; + case SECCLASS_TCP_SOCKET: netif_perm = NETIF__TCP_SEND; node_perm = NODE__TCP_SEND; send_perm = TCP_SOCKET__SEND_MSG; break; + case SECCLASS_DCCP_SOCKET: netif_perm = NETIF__DCCP_SEND; node_perm = NODE__DCCP_SEND; send_perm = DCCP_SOCKET__SEND_MSG; break; + default: netif_perm = NETIF__RAWIP_SEND; node_perm = NODE__RAWIP_SEND; - send_perm = 0; break; } - err = sel_netif_sid(ifindex, &if_sid); + err = avc_has_perm(isec->sid, if_sid, SECCLASS_NETIF, netif_perm, ad); if (err) - return err; - err = avc_has_perm(sk_sid, if_sid, SECCLASS_NETIF, netif_perm, ad); - return err; + goto out; - err = sel_netnode_sid(addrp, family, &node_sid); + err = security_node_sid(family, addrp, len, &node_sid); if (err) - return err; - err = avc_has_perm(sk_sid, node_sid, SECCLASS_NODE, node_perm, ad); + goto out; + + err = avc_has_perm(isec->sid, node_sid, SECCLASS_NODE, node_perm, ad); if (err) - return err; - - if (send_perm != 0) - return 0; - - err = 
security_port_sid(sk->sk_family, sk->sk_type, - sk->sk_protocol, ntohs(ad->u.net.dport), - &port_sid); - if (unlikely(err)) { - printk(KERN_WARNING - "SELinux: failure in" - " selinux_ip_postroute_iptables_compat()," - " network port label not found\n"); - return err; - } - return avc_has_perm(sk_sid, port_sid, sk_class, send_perm, ad); -} - -static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, - int ifindex, - struct avc_audit_data *ad, - u16 family, - char *addrp, - u8 proto) -{ - struct sock *sk = skb->sk; - struct sk_security_struct *sksec; + goto out; - if (sk == NULL) - return NF_ACCEPT; - sksec = sk->sk_security; + if (send_perm) { + u32 port_sid; + + err = security_port_sid(sk->sk_family, + sk->sk_type, + sk->sk_protocol, + ntohs(ad->u.net.dport), + &port_sid); + if (err) + goto out; - if (selinux_compat_net) { - if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex, - ad, family, addrp)) - return NF_DROP; - } else { - if (avc_has_perm(sksec->sid, skb->secmark, - SECCLASS_PACKET, PACKET__SEND, ad)) - return NF_DROP; + err = avc_has_perm(isec->sid, port_sid, isec->sclass, + send_perm, ad); } - - if (selinux_policycap_netpeer) - if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto)) - return NF_DROP; - - return NF_ACCEPT; +out: + return err; } -static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex, - u16 family) +static unsigned int selinux_ip_postroute_last(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *), + u16 family) { - u32 secmark_perm; - u32 peer_sid; + char *addrp; + int len, err = 0; struct sock *sk; struct avc_audit_data ad; - char *addrp; + struct net_device *dev = (struct net_device *)out; + struct sk_security_struct *sksec; u8 proto; - u8 secmark_active; - u8 peerlbl_active; + + sk = skb->sk; + if (!sk) + goto out; + + sksec = sk->sk_security; AVC_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.netif = ifindex; + ad.u.net.netif = dev->name; ad.u.net.family = family; - if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) - return NF_DROP; - - /* If any sort of compatibility mode is enabled then handoff processing - * to the selinux_ip_postroute_compat() function to deal with the - * special handling. We do this in an attempt to keep this function - * as fast and as clean as possible. */ - if (selinux_compat_net || !selinux_policycap_netpeer) - return selinux_ip_postroute_compat(skb, ifindex, &ad, - family, addrp, proto); - - /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec - * packet transformation so allow the packet to pass without any checks - * since we'll have another chance to perform access control checks - * when the packet is on it's final way out. - * NOTE: there appear to be some IPv6 multicast cases where skb->dst - * is NULL, in this case go ahead and apply access control. 
*/ - if (skb->dst != NULL && skb->dst->xfrm != NULL) - return NF_ACCEPT; - - secmark_active = selinux_secmark_enabled(); - peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); - if (!secmark_active && !peerlbl_active) - return NF_ACCEPT; - - /* if the packet is locally generated (skb->sk != NULL) then use the - * socket's label as the peer label, otherwise the packet is being - * forwarded through this system and we need to fetch the peer label - * directly from the packet */ - sk = skb->sk; - if (sk) { - struct sk_security_struct *sksec = sk->sk_security; - peer_sid = sksec->sid; - secmark_perm = PACKET__SEND; - } else { - if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) - return NF_DROP; - secmark_perm = PACKET__FORWARD_OUT; - } - if (secmark_active) - if (avc_has_perm(peer_sid, skb->secmark, - SECCLASS_PACKET, secmark_perm, &ad)) - return NF_DROP; - - if (peerlbl_active) { - u32 if_sid; - u32 node_sid; - - if (sel_netif_sid(ifindex, &if_sid)) - return NF_DROP; - if (avc_has_perm(peer_sid, if_sid, - SECCLASS_NETIF, NETIF__EGRESS, &ad)) - return NF_DROP; - - if (sel_netnode_sid(addrp, family, &node_sid)) - return NF_DROP; - if (avc_has_perm(peer_sid, node_sid, - SECCLASS_NODE, NODE__SENDTO, &ad)) - return NF_DROP; - } + err = selinux_parse_skb(skb, &ad, &addrp, &len, 0, &proto); + if (err) + goto out; - return NF_ACCEPT; + if (selinux_compat_net) + err = selinux_ip_postroute_last_compat(sk, dev, &ad, + family, addrp, len); + else + err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET, + PACKET__SEND, &ad); + + if (err) + goto out; + + err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto); +out: + return err ? NF_DROP : NF_ACCEPT; } -static unsigned int selinux_ipv4_postroute(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int selinux_ipv4_postroute_last(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return selinux_ip_postroute(skb, out->ifindex, PF_INET); + return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static unsigned int selinux_ipv6_postroute(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + +static unsigned int selinux_ipv6_postroute_last(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return selinux_ip_postroute(skb, out->ifindex, PF_INET6); + return selinux_ip_postroute_last(hooknum, skb, in, out, okfn, PF_INET6); } + #endif /* IPV6 */ #endif /* CONFIG_NETFILTER */ @@ -5483,40 +5277,22 @@ security_initcall(selinux_init); #if defined(CONFIG_NETFILTER) -static struct nf_hook_ops selinux_ipv4_ops[] = { - { - .hook = selinux_ipv4_postroute, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_SELINUX_LAST, - }, - { - .hook = selinux_ipv4_forward, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_SELINUX_FIRST, - } +static struct nf_hook_ops selinux_ipv4_op = { + .hook = selinux_ipv4_postroute_last, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_SELINUX_LAST, }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct nf_hook_ops 
selinux_ipv6_ops[] = { - { - .hook = selinux_ipv6_postroute, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_SELINUX_LAST, - }, - { - .hook = selinux_ipv6_forward, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP6_PRI_SELINUX_FIRST, - } +static struct nf_hook_ops selinux_ipv6_op = { + .hook = selinux_ipv6_postroute_last, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_SELINUX_LAST, }; #endif /* IPV6 */ @@ -5524,27 +5300,22 @@ static struct nf_hook_ops selinux_ipv6_ops[] = { static int __init selinux_nf_ip_init(void) { int err = 0; - u32 iter; if (!selinux_enabled) goto out; printk(KERN_DEBUG "SELinux: Registering netfilter hooks\n"); - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) { - err = nf_register_hook(&selinux_ipv4_ops[iter]); - if (err) - panic("SELinux: nf_register_hook for IPv4: error %d\n", - err); - } + err = nf_register_hook(&selinux_ipv4_op); + if (err) + panic("SELinux: nf_register_hook for IPv4: error %d\n", err); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) { - err = nf_register_hook(&selinux_ipv6_ops[iter]); - if (err) - panic("SELinux: nf_register_hook for IPv6: error %d\n", - err); - } + + err = nf_register_hook(&selinux_ipv6_op); + if (err) + panic("SELinux: nf_register_hook for IPv6: error %d\n", err); + #endif /* IPV6 */ out: @@ -5556,15 +5327,11 @@ __initcall(selinux_nf_ip_init); #ifdef CONFIG_SECURITY_SELINUX_DISABLE static void selinux_nf_ip_exit(void) { - u32 iter; - printk(KERN_DEBUG "SELinux: Unregistering netfilter hooks\n"); - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv4_ops); iter++) - nf_unregister_hook(&selinux_ipv4_ops[iter]); + nf_unregister_hook(&selinux_ipv4_op); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - for (iter = 0; iter < ARRAY_SIZE(selinux_ipv6_ops); iter++) - nf_unregister_hook(&selinux_ipv6_ops[iter]); + nf_unregister_hook(&selinux_ipv6_op); #endif /* IPV6 */ } #endif diff --git a/trunk/security/selinux/include/av_perm_to_string.h b/trunk/security/selinux/include/av_perm_to_string.h index 399f868c5c8f..049bf69429b6 100644 --- a/trunk/security/selinux/include/av_perm_to_string.h +++ b/trunk/security/selinux/include/av_perm_to_string.h @@ -37,8 +37,6 @@ S_(SECCLASS_NODE, NODE__ENFORCE_DEST, "enforce_dest") S_(SECCLASS_NODE, NODE__DCCP_RECV, "dccp_recv") S_(SECCLASS_NODE, NODE__DCCP_SEND, "dccp_send") - S_(SECCLASS_NODE, NODE__RECVFROM, "recvfrom") - S_(SECCLASS_NODE, NODE__SENDTO, "sendto") S_(SECCLASS_NETIF, NETIF__TCP_RECV, "tcp_recv") S_(SECCLASS_NETIF, NETIF__TCP_SEND, "tcp_send") S_(SECCLASS_NETIF, NETIF__UDP_RECV, "udp_recv") @@ -47,8 +45,6 @@ S_(SECCLASS_NETIF, NETIF__RAWIP_SEND, "rawip_send") S_(SECCLASS_NETIF, NETIF__DCCP_RECV, "dccp_recv") S_(SECCLASS_NETIF, NETIF__DCCP_SEND, "dccp_send") - S_(SECCLASS_NETIF, NETIF__INGRESS, "ingress") - S_(SECCLASS_NETIF, NETIF__EGRESS, "egress") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__CONNECTTO, "connectto") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__NEWCONN, "newconn") S_(SECCLASS_UNIX_STREAM_SOCKET, UNIX_STREAM_SOCKET__ACCEPTFROM, "acceptfrom") @@ -153,10 +149,6 @@ S_(SECCLASS_PACKET, PACKET__SEND, "send") S_(SECCLASS_PACKET, PACKET__RECV, "recv") S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") - S_(SECCLASS_PACKET, PACKET__FLOW_IN, "flow_in") - S_(SECCLASS_PACKET, PACKET__FLOW_OUT, "flow_out") - S_(SECCLASS_PACKET, 
PACKET__FORWARD_IN, "forward_in") - S_(SECCLASS_PACKET, PACKET__FORWARD_OUT, "forward_out") S_(SECCLASS_KEY, KEY__VIEW, "view") S_(SECCLASS_KEY, KEY__READ, "read") S_(SECCLASS_KEY, KEY__WRITE, "write") @@ -167,4 +159,3 @@ S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NODE_BIND, "node_bind") S_(SECCLASS_DCCP_SOCKET, DCCP_SOCKET__NAME_CONNECT, "name_connect") S_(SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, "mmap_zero") - S_(SECCLASS_PEER, PEER__RECV, "recv") diff --git a/trunk/security/selinux/include/av_permissions.h b/trunk/security/selinux/include/av_permissions.h index 84c9abc80978..eda89a2ec635 100644 --- a/trunk/security/selinux/include/av_permissions.h +++ b/trunk/security/selinux/include/av_permissions.h @@ -292,8 +292,6 @@ #define NODE__ENFORCE_DEST 0x00000040UL #define NODE__DCCP_RECV 0x00000080UL #define NODE__DCCP_SEND 0x00000100UL -#define NODE__RECVFROM 0x00000200UL -#define NODE__SENDTO 0x00000400UL #define NETIF__TCP_RECV 0x00000001UL #define NETIF__TCP_SEND 0x00000002UL #define NETIF__UDP_RECV 0x00000004UL @@ -302,8 +300,6 @@ #define NETIF__RAWIP_SEND 0x00000020UL #define NETIF__DCCP_RECV 0x00000040UL #define NETIF__DCCP_SEND 0x00000080UL -#define NETIF__INGRESS 0x00000100UL -#define NETIF__EGRESS 0x00000200UL #define NETLINK_SOCKET__IOCTL 0x00000001UL #define NETLINK_SOCKET__READ 0x00000002UL #define NETLINK_SOCKET__WRITE 0x00000004UL @@ -796,10 +792,6 @@ #define PACKET__SEND 0x00000001UL #define PACKET__RECV 0x00000002UL #define PACKET__RELABELTO 0x00000004UL -#define PACKET__FLOW_IN 0x00000008UL -#define PACKET__FLOW_OUT 0x00000010UL -#define PACKET__FORWARD_IN 0x00000020UL -#define PACKET__FORWARD_OUT 0x00000040UL #define KEY__VIEW 0x00000001UL #define KEY__READ 0x00000002UL #define KEY__WRITE 0x00000004UL @@ -832,4 +824,3 @@ #define DCCP_SOCKET__NODE_BIND 0x00400000UL #define DCCP_SOCKET__NAME_CONNECT 0x00800000UL #define MEMPROTECT__MMAP_ZERO 0x00000001UL -#define PEER__RECV 0x00000001UL diff --git a/trunk/security/selinux/include/avc.h b/trunk/security/selinux/include/avc.h index 80c28fa6621c..553607a19e92 100644 --- a/trunk/security/selinux/include/avc.h +++ b/trunk/security/selinux/include/avc.h @@ -51,7 +51,7 @@ struct avc_audit_data { struct inode *inode; } fs; struct { - int netif; + char *netif; struct sock *sk; u16 family; __be16 dport; diff --git a/trunk/security/selinux/include/class_to_string.h b/trunk/security/selinux/include/class_to_string.h index b1b0d1d8f950..e77de0e62ea0 100644 --- a/trunk/security/selinux/include/class_to_string.h +++ b/trunk/security/selinux/include/class_to_string.h @@ -64,10 +64,3 @@ S_(NULL) S_("dccp_socket") S_("memprotect") - S_(NULL) - S_(NULL) - S_(NULL) - S_(NULL) - S_(NULL) - S_(NULL) - S_("peer") diff --git a/trunk/security/selinux/include/flask.h b/trunk/security/selinux/include/flask.h index 09e9dd23ee1a..a9c2b20f14b5 100644 --- a/trunk/security/selinux/include/flask.h +++ b/trunk/security/selinux/include/flask.h @@ -50,7 +50,6 @@ #define SECCLASS_KEY 58 #define SECCLASS_DCCP_SOCKET 60 #define SECCLASS_MEMPROTECT 61 -#define SECCLASS_PEER 68 /* * Security identifier indices for initial entities diff --git a/trunk/security/selinux/include/netif.h b/trunk/security/selinux/include/netif.h index ce23edd128b3..8bd6f9992d2b 100644 --- a/trunk/security/selinux/include/netif.h +++ b/trunk/security/selinux/include/netif.h @@ -7,8 +7,6 @@ * Author: James Morris * * Copyright (C) 2003 Red Hat, Inc., James Morris - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. 
- * Paul Moore, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -17,7 +15,7 @@ #ifndef _SELINUX_NETIF_H_ #define _SELINUX_NETIF_H_ -int sel_netif_sid(int ifindex, u32 *sid); +int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid); #endif /* _SELINUX_NETIF_H_ */ diff --git a/trunk/security/selinux/include/netlabel.h b/trunk/security/selinux/include/netlabel.h index 00a2809c8506..218e3f77c350 100644 --- a/trunk/security/selinux/include/netlabel.h +++ b/trunk/security/selinux/include/netlabel.h @@ -46,17 +46,13 @@ void selinux_netlbl_sk_security_init(struct sk_security_struct *ssec, void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, struct sk_security_struct *newssec); -int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u16 family, - u32 *type, - u32 *sid); +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid); void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock); int selinux_netlbl_socket_post_create(struct socket *sock); int selinux_netlbl_inode_permission(struct inode *inode, int mask); int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, - u16 family, struct avc_audit_data *ad); int selinux_netlbl_socket_setsockopt(struct socket *sock, int level, @@ -87,11 +83,9 @@ static inline void selinux_netlbl_sk_security_clone( } static inline int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u16 family, - u32 *type, + u32 base_sid, u32 *sid) { - *type = NETLBL_NLTYPE_NONE; *sid = SECSID_NULL; return 0; } @@ -112,7 +106,6 @@ static inline int selinux_netlbl_inode_permission(struct inode *inode, } static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, - u16 family, struct avc_audit_data *ad) { return 0; diff --git a/trunk/security/selinux/include/netnode.h b/trunk/security/selinux/include/netnode.h deleted file mode 100644 index 1b94450d11d2..000000000000 --- a/trunk/security/selinux/include/netnode.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Network node table - * - * SELinux must keep a mapping of network nodes to labels/SIDs. This - * mapping is maintained as part of the normal policy but a fast cache is - * needed to reduce the lookup overhead since most of these queries happen on - * a per-packet basis. - * - * Author: Paul Moore - * - */ - -/* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#ifndef _SELINUX_NETNODE_H -#define _SELINUX_NETNODE_H - -int sel_netnode_sid(void *addr, u16 family, u32 *sid); - -#endif diff --git a/trunk/security/selinux/include/objsec.h b/trunk/security/selinux/include/objsec.h index c6c2bb4ebacc..4138a80f8e27 100644 --- a/trunk/security/selinux/include/objsec.h +++ b/trunk/security/selinux/include/objsec.h @@ -96,25 +96,17 @@ struct bprm_security_struct { }; struct netif_security_struct { - int ifindex; /* device index */ - u32 sid; /* SID for this interface */ -}; - -struct netnode_security_struct { - union { - __be32 ipv4; /* IPv4 node address */ - struct in6_addr ipv6; /* IPv6 node address */ - } addr; - u32 sid; /* SID for this node */ - u16 family; /* address family */ + struct net_device *dev; /* back pointer */ + u32 if_sid; /* SID for this interface */ + u32 msg_sid; /* default SID for messages received on this interface */ }; struct sk_security_struct { struct sock *sk; /* back pointer to sk object */ u32 sid; /* SID of this object */ u32 peer_sid; /* SID of peer */ - u16 sclass; /* sock security class */ #ifdef CONFIG_NETLABEL + u16 sclass; /* sock security class */ enum { /* NetLabel state */ NLBL_UNSET = 0, NLBL_REQUIRE, diff --git a/trunk/security/selinux/include/security.h b/trunk/security/selinux/include/security.h index 23137c17f917..39337afffec2 100644 --- a/trunk/security/selinux/include/security.h +++ b/trunk/security/selinux/include/security.h @@ -25,14 +25,13 @@ #define POLICYDB_VERSION_MLS 19 #define POLICYDB_VERSION_AVTAB 20 #define POLICYDB_VERSION_RANGETRANS 21 -#define POLICYDB_VERSION_POLCAP 22 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX #define POLICYDB_VERSION_MAX CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE #else -#define POLICYDB_VERSION_MAX POLICYDB_VERSION_POLCAP +#define POLICYDB_VERSION_MAX POLICYDB_VERSION_RANGETRANS #endif struct netlbl_lsm_secattr; @@ -40,19 +39,8 @@ struct netlbl_lsm_secattr; extern int selinux_enabled; extern int selinux_mls_enabled; -/* Policy capabilities */ -enum { - POLICYDB_CAPABILITY_NETPEER, - __POLICYDB_CAPABILITY_MAX -}; -#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) - -extern int selinux_policycap_netpeer; - int security_load_policy(void * data, size_t len); -int security_policycap_supported(unsigned int req_cap); - #define SEL_VEC_MAX 32 struct av_decision { u32 allowed; @@ -89,7 +77,8 @@ int security_get_user_sids(u32 callsid, char *username, int security_port_sid(u16 domain, u16 type, u8 protocol, u16 port, u32 *out_sid); -int security_netif_sid(char *name, u32 *if_sid); +int security_netif_sid(char *name, u32 *if_sid, + u32 *msg_sid); int security_node_sid(u16 domain, void *addr, u32 addrlen, u32 *out_sid); @@ -99,15 +88,10 @@ int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid); -int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, - u32 xfrm_sid, - u32 *peer_sid); - int security_get_classes(char ***classes, int *nclasses); int security_get_permissions(char *class, char ***perms, int *nperms); int security_get_reject_unknown(void); int security_get_allow_unknown(void); -int security_get_policycaps(int *len, int **values); #define SECURITY_FS_USE_XATTR 1 /* use xattr */ #define SECURITY_FS_USE_TRANS 2 /* use transition SIDs, e.g. 
devpts/tmpfs */ @@ -124,6 +108,7 @@ int security_genfs_sid(const char *fstype, char *name, u16 sclass, #ifdef CONFIG_NETLABEL int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, + u32 base_sid, u32 *sid); int security_netlbl_sid_to_secattr(u32 sid, @@ -131,6 +116,7 @@ int security_netlbl_sid_to_secattr(u32 sid, #else static inline int security_netlbl_secattr_to_sid( struct netlbl_lsm_secattr *secattr, + u32 base_sid, u32 *sid) { return -EIDRM; diff --git a/trunk/security/selinux/include/xfrm.h b/trunk/security/selinux/include/xfrm.h index 36b0510efa7b..31929e39f5ca 100644 --- a/trunk/security/selinux/include/xfrm.h +++ b/trunk/security/selinux/include/xfrm.h @@ -32,13 +32,6 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk) } #ifdef CONFIG_SECURITY_NETWORK_XFRM -extern atomic_t selinux_xfrm_refcount; - -static inline int selinux_xfrm_enabled(void) -{ - return (atomic_read(&selinux_xfrm_refcount) > 0); -} - int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, struct avc_audit_data *ad); int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, @@ -50,11 +43,6 @@ static inline void selinux_xfrm_notify_policyload(void) atomic_inc(&flow_cache_genid); } #else -static inline int selinux_xfrm_enabled(void) -{ - return 0; -} - static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, struct avc_audit_data *ad) { diff --git a/trunk/security/selinux/netif.c b/trunk/security/selinux/netif.c index 013d3117a86b..e87ab948104c 100644 --- a/trunk/security/selinux/netif.c +++ b/trunk/security/selinux/netif.c @@ -7,8 +7,6 @@ * Author: James Morris * * Copyright (C) 2003 Red Hat, Inc., James Morris - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. - * Paul Moore * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, @@ -31,6 +29,14 @@ #define SEL_NETIF_HASH_SIZE 64 #define SEL_NETIF_HASH_MAX 1024 +#undef DEBUG + +#ifdef DEBUG +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + struct sel_netif { struct list_head list; @@ -43,226 +49,174 @@ static LIST_HEAD(sel_netif_list); static DEFINE_SPINLOCK(sel_netif_lock); static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE]; -/** - * sel_netif_hashfn - Hashing function for the interface table - * @ifindex: the network interface - * - * Description: - * This is the hashing function for the network interface table, it returns the - * bucket number for the given interface. - * - */ -static inline u32 sel_netif_hashfn(int ifindex) +static inline u32 sel_netif_hasfn(struct net_device *dev) { - return (ifindex & (SEL_NETIF_HASH_SIZE - 1)); + return (dev->ifindex & (SEL_NETIF_HASH_SIZE - 1)); } -/** - * sel_netif_find - Search for an interface record - * @ifindex: the network interface - * - * Description: - * Search the network interface table and return the record matching @ifindex. - * If an entry can not be found in the table return NULL. - * +/* + * All of the devices should normally fit in the hash, so we optimize + * for that case. 
*/ -static inline struct sel_netif *sel_netif_find(int ifindex) +static inline struct sel_netif *sel_netif_find(struct net_device *dev) { - int idx = sel_netif_hashfn(ifindex); - struct sel_netif *netif; + struct list_head *pos; + int idx = sel_netif_hasfn(dev); - list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list) - /* all of the devices should normally fit in the hash, so we - * optimize for that case */ - if (likely(netif->nsec.ifindex == ifindex)) + __list_for_each_rcu(pos, &sel_netif_hash[idx]) { + struct sel_netif *netif = list_entry(pos, + struct sel_netif, list); + if (likely(netif->nsec.dev == dev)) return netif; - + } return NULL; } -/** - * sel_netif_insert - Insert a new interface into the table - * @netif: the new interface record - * - * Description: - * Add a new interface record to the network interface hash table. Returns - * zero on success, negative values on failure. - * - */ static int sel_netif_insert(struct sel_netif *netif) { - int idx; + int idx, ret = 0; - if (sel_netif_total >= SEL_NETIF_HASH_MAX) - return -ENOSPC; + if (sel_netif_total >= SEL_NETIF_HASH_MAX) { + ret = -ENOSPC; + goto out; + } - idx = sel_netif_hashfn(netif->nsec.ifindex); + idx = sel_netif_hasfn(netif->nsec.dev); list_add_rcu(&netif->list, &sel_netif_hash[idx]); sel_netif_total++; - - return 0; +out: + return ret; } -/** - * sel_netif_free - Frees an interface entry - * @p: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table interface entry can be - * released safely. - * - */ static void sel_netif_free(struct rcu_head *p) { struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); + + DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); kfree(netif); } -/** - * sel_netif_destroy - Remove an interface record from the table - * @netif: the existing interface record - * - * Description: - * Remove an existing interface record from the network interface table. - * - */ static void sel_netif_destroy(struct sel_netif *netif) { + DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); + list_del_rcu(&netif->list); sel_netif_total--; call_rcu(&netif->rcu_head, sel_netif_free); } -/** - * sel_netif_sid_slow - Lookup the SID of a network interface using the policy - * @ifindex: the network interface - * @sid: interface SID - * - * Description: - * This function determines the SID of a network interface by quering the - * security policy. The result is added to the network interface table to - * speedup future queries. Returns zero on success, negative values on - * failure. 
- * - */ -static int sel_netif_sid_slow(int ifindex, u32 *sid) +static struct sel_netif *sel_netif_lookup(struct net_device *dev) { int ret; - struct sel_netif *netif; - struct sel_netif *new = NULL; - struct net_device *dev; - - /* NOTE: we always use init's network namespace since we don't - * currently support containers */ - - dev = dev_get_by_index(&init_net, ifindex); - if (unlikely(dev == NULL)) { - printk(KERN_WARNING - "SELinux: failure in sel_netif_sid_slow()," - " invalid network interface (%d)\n", ifindex); - return -ENOENT; - } + struct sel_netif *netif, *new; + struct netif_security_struct *nsec; - spin_lock_bh(&sel_netif_lock); - netif = sel_netif_find(ifindex); - if (netif != NULL) { - *sid = netif->nsec.sid; - ret = 0; + netif = sel_netif_find(dev); + if (likely(netif != NULL)) goto out; - } + new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) { - ret = -ENOMEM; + if (!new) { + netif = ERR_PTR(-ENOMEM); goto out; } - ret = security_netif_sid(dev->name, &new->nsec.sid); - if (ret != 0) - goto out; - new->nsec.ifindex = ifindex; - ret = sel_netif_insert(new); - if (ret != 0) + + nsec = &new->nsec; + + ret = security_netif_sid(dev->name, &nsec->if_sid, &nsec->msg_sid); + if (ret < 0) { + kfree(new); + netif = ERR_PTR(ret); goto out; - *sid = new->nsec.sid; + } -out: + nsec->dev = dev; + + spin_lock_bh(&sel_netif_lock); + + netif = sel_netif_find(dev); + if (netif) { + spin_unlock_bh(&sel_netif_lock); + kfree(new); + goto out; + } + + ret = sel_netif_insert(new); spin_unlock_bh(&sel_netif_lock); - dev_put(dev); - if (unlikely(ret)) { - printk(KERN_WARNING - "SELinux: failure in sel_netif_sid_slow()," - " unable to determine network interface label (%d)\n", - ifindex); + + if (ret) { kfree(new); + netif = ERR_PTR(ret); + goto out; } + + netif = new; + + DEBUGP("new: ifindex=%u name=%s if_sid=%u msg_sid=%u\n", dev->ifindex, dev->name, + nsec->if_sid, nsec->msg_sid); +out: + return netif; +} + +static void sel_netif_assign_sids(u32 if_sid_in, u32 msg_sid_in, u32 *if_sid_out, u32 *msg_sid_out) +{ + if (if_sid_out) + *if_sid_out = if_sid_in; + if (msg_sid_out) + *msg_sid_out = msg_sid_in; +} + +static int sel_netif_sids_slow(struct net_device *dev, u32 *if_sid, u32 *msg_sid) +{ + int ret = 0; + u32 tmp_if_sid, tmp_msg_sid; + + ret = security_netif_sid(dev->name, &tmp_if_sid, &tmp_msg_sid); + if (!ret) + sel_netif_assign_sids(tmp_if_sid, tmp_msg_sid, if_sid, msg_sid); return ret; } -/** - * sel_netif_sid - Lookup the SID of a network interface - * @ifindex: the network interface - * @sid: interface SID - * - * Description: - * This function determines the SID of a network interface using the fastest - * method possible. First the interface table is queried, but if an entry - * can't be found then the policy is queried and the result is added to the - * table to speedup future queries. Returns zero on success, negative values - * on failure. 
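sel_netif_lookup() above builds the new entry outside the lock (the policy query can be slow), then re-checks the table under sel_netif_lock before inserting, so a racing CPU that got there first wins and the duplicate is freed. A sketch of that check/allocate/re-check/insert shape, with a pthread mutex standing in for the spinlock; if_find(), if_insert_locked() and query_policy() are stand-ins, not kernel API:

#include <pthread.h>
#include <stdlib.h>

struct if_entry { int ifindex; unsigned int sid; struct if_entry *next; };

/* Stand-ins: a lookup, an insert done while holding the lock, and the
 * (potentially slow) policy query that produces the SID. */
struct if_entry *if_find(int ifindex);
void if_insert_locked(struct if_entry *e);
unsigned int query_policy(int ifindex);

static pthread_mutex_t if_lock = PTHREAD_MUTEX_INITIALIZER;

struct if_entry *if_lookup(int ifindex)
{
        struct if_entry *e, *new;

        e = if_find(ifindex);                   /* unlocked fast path */
        if (e)
                return e;

        new = malloc(sizeof(*new));             /* build the candidate outside the lock */
        if (!new)
                return NULL;
        new->ifindex = ifindex;
        new->sid = query_policy(ifindex);

        pthread_mutex_lock(&if_lock);
        e = if_find(ifindex);                   /* re-check: another thread may have won */
        if (e) {
                pthread_mutex_unlock(&if_lock);
                free(new);
                return e;
        }
        if_insert_locked(new);
        pthread_mutex_unlock(&if_lock);
        return new;
}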
- * - */ -int sel_netif_sid(int ifindex, u32 *sid) +int sel_netif_sids(struct net_device *dev, u32 *if_sid, u32 *msg_sid) { + int ret = 0; struct sel_netif *netif; rcu_read_lock(); - netif = sel_netif_find(ifindex); - if (likely(netif != NULL)) { - *sid = netif->nsec.sid; + netif = sel_netif_lookup(dev); + if (IS_ERR(netif)) { rcu_read_unlock(); - return 0; + ret = sel_netif_sids_slow(dev, if_sid, msg_sid); + goto out; } + sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid); rcu_read_unlock(); - - return sel_netif_sid_slow(ifindex, sid); +out: + return ret; } -/** - * sel_netif_kill - Remove an entry from the network interface table - * @ifindex: the network interface - * - * Description: - * This function removes the entry matching @ifindex from the network interface - * table if it exists. - * - */ -static void sel_netif_kill(int ifindex) +static void sel_netif_kill(struct net_device *dev) { struct sel_netif *netif; spin_lock_bh(&sel_netif_lock); - netif = sel_netif_find(ifindex); + netif = sel_netif_find(dev); if (netif) sel_netif_destroy(netif); spin_unlock_bh(&sel_netif_lock); } -/** - * sel_netif_flush - Flush the entire network interface table - * - * Description: - * Remove all entries from the network interface table. - * - */ static void sel_netif_flush(void) { int idx; - struct sel_netif *netif; spin_lock_bh(&sel_netif_lock); - for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) + for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) { + struct sel_netif *netif; + list_for_each_entry(netif, &sel_netif_hash[idx], list) sel_netif_destroy(netif); + } spin_unlock_bh(&sel_netif_lock); } @@ -285,7 +239,7 @@ static int sel_netif_netdev_notifier_handler(struct notifier_block *this, return NOTIFY_DONE; if (event == NETDEV_DOWN) - sel_netif_kill(dev->ifindex); + sel_netif_kill(dev); return NOTIFY_DONE; } @@ -296,10 +250,10 @@ static struct notifier_block sel_netif_netdev_notifier = { static __init int sel_netif_init(void) { - int i, err; + int i, err = 0; if (!selinux_enabled) - return 0; + goto out; for (i = 0; i < SEL_NETIF_HASH_SIZE; i++) INIT_LIST_HEAD(&sel_netif_hash[i]); @@ -311,6 +265,7 @@ static __init int sel_netif_init(void) if (err) panic("avc_add_callback() failed, error %d\n", err); +out: return err; } diff --git a/trunk/security/selinux/netlabel.c b/trunk/security/selinux/netlabel.c index 0fa2be4149e8..66e013d6f6f6 100644 --- a/trunk/security/selinux/netlabel.c +++ b/trunk/security/selinux/netlabel.c @@ -35,33 +35,6 @@ #include "objsec.h" #include "security.h" -/** - * selinux_netlbl_sidlookup_cached - Cache a SID lookup - * @skb: the packet - * @secattr: the NetLabel security attributes - * @sid: the SID - * - * Description: - * Query the SELinux security server to lookup the correct SID for the given - * security attributes. If the query is successful, cache the result to speed - * up future lookups. Returns zero on success, negative values on failure. 
- * - */ -static int selinux_netlbl_sidlookup_cached(struct sk_buff *skb, - struct netlbl_lsm_secattr *secattr, - u32 *sid) -{ - int rc; - - rc = security_netlbl_secattr_to_sid(secattr, sid); - if (rc == 0 && - (secattr->flags & NETLBL_SECATTR_CACHEABLE) && - (secattr->flags & NETLBL_SECATTR_CACHE)) - netlbl_cache_add(skb, secattr); - - return rc; -} - /** * selinux_netlbl_sock_setsid - Label a socket using the NetLabel mechanism * @sk: the socket to label @@ -164,14 +137,14 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, * lock as other threads could have access to ssec */ rcu_read_lock(); selinux_netlbl_sk_security_reset(newssec, ssec->sk->sk_family); + newssec->sclass = ssec->sclass; rcu_read_unlock(); } /** * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel * @skb: the packet - * @family: protocol family - * @type: NetLabel labeling protocol type + * @base_sid: the SELinux SID to use as a context for MLS only attributes * @sid: the SID * * Description: @@ -180,10 +153,7 @@ void selinux_netlbl_sk_security_clone(struct sk_security_struct *ssec, * assign to the packet. Returns zero on success, negative values on failure. * */ -int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, - u16 family, - u32 *type, - u32 *sid) +int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, u32 base_sid, u32 *sid) { int rc; struct netlbl_lsm_secattr secattr; @@ -194,12 +164,15 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, } netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, family, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = selinux_netlbl_sidlookup_cached(skb, &secattr, sid); - else + rc = netlbl_skbuff_getattr(skb, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { + rc = security_netlbl_secattr_to_sid(&secattr, base_sid, sid); + if (rc == 0 && + (secattr.flags & NETLBL_SECATTR_CACHEABLE) && + (secattr.flags & NETLBL_SECATTR_CACHE)) + netlbl_cache_add(skb, &secattr); + } else *sid = SECSID_NULL; - *type = secattr.type; netlbl_secattr_destroy(&secattr); return rc; @@ -217,10 +190,13 @@ int selinux_netlbl_skbuff_getsid(struct sk_buff *skb, */ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) { + struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; struct sk_security_struct *sksec = sk->sk_security; struct netlbl_lsm_secattr secattr; u32 nlbl_peer_sid; + sksec->sclass = isec->sclass; + rcu_read_lock(); if (sksec->nlbl_state != NLBL_REQUIRE) { @@ -231,7 +207,9 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock) netlbl_secattr_init(&secattr); if (netlbl_sock_getattr(sk, &secattr) == 0 && secattr.flags != NETLBL_SECATTR_NONE && - security_netlbl_secattr_to_sid(&secattr, &nlbl_peer_sid) == 0) + security_netlbl_secattr_to_sid(&secattr, + SECINITSID_NETMSG, + &nlbl_peer_sid) == 0) sksec->peer_sid = nlbl_peer_sid; netlbl_secattr_destroy(&secattr); @@ -256,8 +234,11 @@ int selinux_netlbl_socket_post_create(struct socket *sock) { int rc = 0; struct sock *sk = sock->sk; + struct inode_security_struct *isec = SOCK_INODE(sock)->i_security; struct sk_security_struct *sksec = sk->sk_security; + sksec->sclass = isec->sclass; + rcu_read_lock(); if (sksec->nlbl_state == NLBL_REQUIRE) rc = selinux_netlbl_sock_setsid(sk, sksec->sid); @@ -311,7 +292,6 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel * @sksec: the sock's sk_security_struct * @skb: the packet - * 
@family: protocol family * @ad: the audit data * * Description: @@ -322,7 +302,6 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask) */ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, struct sk_buff *skb, - u16 family, struct avc_audit_data *ad) { int rc; @@ -334,10 +313,16 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec, return 0; netlbl_secattr_init(&secattr); - rc = netlbl_skbuff_getattr(skb, family, &secattr); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) - rc = selinux_netlbl_sidlookup_cached(skb, &secattr, &nlbl_sid); - else + rc = netlbl_skbuff_getattr(skb, &secattr); + if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) { + rc = security_netlbl_secattr_to_sid(&secattr, + SECINITSID_NETMSG, + &nlbl_sid); + if (rc == 0 && + (secattr.flags & NETLBL_SECATTR_CACHEABLE) && + (secattr.flags & NETLBL_SECATTR_CACHE)) + netlbl_cache_add(skb, &secattr); + } else nlbl_sid = SECINITSID_UNLABELED; netlbl_secattr_destroy(&secattr); if (rc != 0) diff --git a/trunk/security/selinux/netnode.c b/trunk/security/selinux/netnode.c deleted file mode 100644 index f3c526f2cacb..000000000000 --- a/trunk/security/selinux/netnode.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Network node table - * - * SELinux must keep a mapping of network nodes to labels/SIDs. This - * mapping is maintained as part of the normal policy but a fast cache is - * needed to reduce the lookup overhead since most of these queries happen on - * a per-packet basis. - * - * Author: Paul Moore - * - * This code is heavily based on the "netif" concept originally developed by - * James Morris - * (see security/selinux/netif.c for more information) - * - */ - -/* - * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "objsec.h" - -#define SEL_NETNODE_HASH_SIZE 256 -#define SEL_NETNODE_HASH_BKT_LIMIT 16 - -struct sel_netnode { - struct netnode_security_struct nsec; - - struct list_head list; - struct rcu_head rcu; -}; - -/* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason - * for this is that I suspect most users will not make heavy use of both - * address families at the same time so one table will usually end up wasted, - * if this becomes a problem we can always add a hash table for each address - * family later */ - -static LIST_HEAD(sel_netnode_list); -static DEFINE_SPINLOCK(sel_netnode_lock); -static struct list_head sel_netnode_hash[SEL_NETNODE_HASH_SIZE]; - -/** - * sel_netnode_free - Frees a node entry - * @p: the entry's RCU field - * - * Description: - * This function is designed to be used as a callback to the call_rcu() - * function so that memory allocated to a hash table node entry can be - * released safely. 
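As the note above says, one hash table serves both address families; only the bucket computation differs per family, and for IPv6 just the last 32 bits are folded in since they are the most likely to differ. A small sketch of that dispatch, assuming a 256-bucket power-of-two table; node_hash() and friends are illustrative names:

#include <netinet/in.h>
#include <sys/socket.h>

#define NODE_HASH_SIZE 256              /* power of two, one table for both families */

static unsigned int node_hash_ipv4(struct in_addr addr)
{
        /* like the kernel helper, this masks the raw (network byte order) word */
        return addr.s_addr & (NODE_HASH_SIZE - 1);
}

static unsigned int node_hash_ipv6(const struct in6_addr *addr)
{
        /* fold in only the last 32 bits, as sel_netnode_hashfn_ipv6() does */
        const unsigned char *b = addr->s6_addr;
        unsigned int last = ((unsigned int)b[12] << 24) | ((unsigned int)b[13] << 16) |
                            ((unsigned int)b[14] << 8)  |  (unsigned int)b[15];
        return last & (NODE_HASH_SIZE - 1);
}

static unsigned int node_hash(int family, const void *addr)
{
        switch (family) {
        case AF_INET:
                return node_hash_ipv4(*(const struct in_addr *)addr);
        case AF_INET6:
                return node_hash_ipv6(addr);
        default:
                return 0;               /* the kernel BUG()s on anything else */
        }
}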
- * - */ -static void sel_netnode_free(struct rcu_head *p) -{ - struct sel_netnode *node = container_of(p, struct sel_netnode, rcu); - kfree(node); -} - -/** - * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table - * @addr: IPv4 address - * - * Description: - * This is the IPv4 hashing function for the node interface table, it returns - * the bucket number for the given IP address. - * - */ -static u32 sel_netnode_hashfn_ipv4(__be32 addr) -{ - /* at some point we should determine if the mismatch in byte order - * affects the hash function dramatically */ - return (addr & (SEL_NETNODE_HASH_SIZE - 1)); -} - -/** - * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table - * @addr: IPv6 address - * - * Description: - * This is the IPv6 hashing function for the node interface table, it returns - * the bucket number for the given IP address. - * - */ -static u32 sel_netnode_hashfn_ipv6(const struct in6_addr *addr) -{ - /* just hash the least significant 32 bits to keep things fast (they - * are the most likely to be different anyway), we can revisit this - * later if needed */ - return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1)); -} - -/** - * sel_netnode_find - Search for a node record - * @addr: IP address - * @family: address family - * - * Description: - * Search the network node table and return the record matching @addr. If an - * entry can not be found in the table return NULL. - * - */ -static struct sel_netnode *sel_netnode_find(const void *addr, u16 family) -{ - u32 idx; - struct sel_netnode *node; - - switch (family) { - case PF_INET: - idx = sel_netnode_hashfn_ipv4(*(__be32 *)addr); - break; - case PF_INET6: - idx = sel_netnode_hashfn_ipv6(addr); - break; - default: - BUG(); - } - - list_for_each_entry_rcu(node, &sel_netnode_hash[idx], list) - if (node->nsec.family == family) - switch (family) { - case PF_INET: - if (node->nsec.addr.ipv4 == *(__be32 *)addr) - return node; - break; - case PF_INET6: - if (ipv6_addr_equal(&node->nsec.addr.ipv6, - addr)) - return node; - break; - } - - return NULL; -} - -/** - * sel_netnode_insert - Insert a new node into the table - * @node: the new node record - * - * Description: - * Add a new node record to the network address hash table. Returns zero on - * success, negative values on failure. - * - */ -static int sel_netnode_insert(struct sel_netnode *node) -{ - u32 idx; - u32 count = 0; - struct sel_netnode *iter; - - switch (node->nsec.family) { - case PF_INET: - idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4); - break; - case PF_INET6: - idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6); - break; - default: - BUG(); - } - list_add_rcu(&node->list, &sel_netnode_hash[idx]); - - /* we need to impose a limit on the growth of the hash table so check - * this bucket to make sure it is within the specified bounds */ - list_for_each_entry(iter, &sel_netnode_hash[idx], list) - if (++count > SEL_NETNODE_HASH_BKT_LIMIT) { - list_del_rcu(&iter->list); - call_rcu(&iter->rcu, sel_netnode_free); - break; - } - - return 0; -} - -/** - * sel_netnode_destroy - Remove a node record from the table - * @node: the existing node record - * - * Description: - * Remove an existing node record from the network address table. 
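sel_netnode_insert() above caps bucket growth: the new entry goes on the head of the bucket, then the bucket is walked and whatever entry pushes it past SEL_NETNODE_HASH_BKT_LIMIT (the oldest one, since insertion is at the head) is dropped. The same idea with a plain list and an immediate free() instead of call_rcu(); names are illustrative:

#include <stdlib.h>

#define NODE_HASH_SIZE   256
#define NODE_BKT_LIMIT    16            /* cap on the length of any one bucket */

struct node {
        unsigned int key;
        unsigned int sid;
        struct node *next;
};

static struct node *node_table[NODE_HASH_SIZE];

static void node_insert(unsigned int bucket, struct node *new)
{
        struct node **pp;
        unsigned int count = 0;

        /* newest entries live at the head of the bucket */
        new->next = node_table[bucket];
        node_table[bucket] = new;

        /* whatever pushes the bucket past the limit is the oldest entry,
         * so unlink and free it (the kernel defers the free via RCU) */
        for (pp = &node_table[bucket]; *pp; pp = &(*pp)->next) {
                if (++count > NODE_BKT_LIMIT) {
                        struct node *victim = *pp;
                        *pp = victim->next;
                        free(victim);
                        return;
                }
        }
}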
- * - */ -static void sel_netnode_destroy(struct sel_netnode *node) -{ - list_del_rcu(&node->list); - call_rcu(&node->rcu, sel_netnode_free); -} - -/** - * sel_netnode_sid_slow - Lookup the SID of a network address using the policy - * @addr: the IP address - * @family: the address family - * @sid: node SID - * - * Description: - * This function determines the SID of a network address by quering the - * security policy. The result is added to the network address table to - * speedup future queries. Returns zero on success, negative values on - * failure. - * - */ -static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) -{ - int ret; - struct sel_netnode *node; - struct sel_netnode *new = NULL; - - spin_lock_bh(&sel_netnode_lock); - node = sel_netnode_find(addr, family); - if (node != NULL) { - *sid = node->nsec.sid; - ret = 0; - goto out; - } - new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) { - ret = -ENOMEM; - goto out; - } - switch (family) { - case PF_INET: - ret = security_node_sid(PF_INET, - addr, sizeof(struct in_addr), - &new->nsec.sid); - new->nsec.addr.ipv4 = *(__be32 *)addr; - break; - case PF_INET6: - ret = security_node_sid(PF_INET6, - addr, sizeof(struct in6_addr), - &new->nsec.sid); - ipv6_addr_copy(&new->nsec.addr.ipv6, addr); - break; - default: - BUG(); - } - if (ret != 0) - goto out; - new->nsec.family = family; - ret = sel_netnode_insert(new); - if (ret != 0) - goto out; - *sid = new->nsec.sid; - -out: - spin_unlock_bh(&sel_netnode_lock); - if (unlikely(ret)) { - printk(KERN_WARNING - "SELinux: failure in sel_netnode_sid_slow()," - " unable to determine network node label\n"); - kfree(new); - } - return ret; -} - -/** - * sel_netnode_sid - Lookup the SID of a network address - * @addr: the IP address - * @family: the address family - * @sid: node SID - * - * Description: - * This function determines the SID of a network address using the fastest - * method possible. First the address table is queried, but if an entry - * can't be found then the policy is queried and the result is added to the - * table to speedup future queries. Returns zero on success, negative values - * on failure. - * - */ -int sel_netnode_sid(void *addr, u16 family, u32 *sid) -{ - struct sel_netnode *node; - - rcu_read_lock(); - node = sel_netnode_find(addr, family); - if (node != NULL) { - *sid = node->nsec.sid; - rcu_read_unlock(); - return 0; - } - rcu_read_unlock(); - - return sel_netnode_sid_slow(addr, family, sid); -} - -/** - * sel_netnode_flush - Flush the entire network address table - * - * Description: - * Remove all entries from the network address table. 
- * - */ -static void sel_netnode_flush(void) -{ - u32 idx; - struct sel_netnode *node; - - spin_lock_bh(&sel_netnode_lock); - for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++) - list_for_each_entry(node, &sel_netnode_hash[idx], list) - sel_netnode_destroy(node); - spin_unlock_bh(&sel_netnode_lock); -} - -static int sel_netnode_avc_callback(u32 event, u32 ssid, u32 tsid, - u16 class, u32 perms, u32 *retained) -{ - if (event == AVC_CALLBACK_RESET) { - sel_netnode_flush(); - synchronize_net(); - } - return 0; -} - -static __init int sel_netnode_init(void) -{ - int iter; - int ret; - - if (!selinux_enabled) - return 0; - - for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++) - INIT_LIST_HEAD(&sel_netnode_hash[iter]); - - ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET, - SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0); - if (ret != 0) - panic("avc_add_callback() failed, error %d\n", ret); - - return ret; -} - -__initcall(sel_netnode_init); diff --git a/trunk/security/selinux/selinuxfs.c b/trunk/security/selinux/selinuxfs.c index a85740530afc..397fd4955fe1 100644 --- a/trunk/security/selinux/selinuxfs.c +++ b/trunk/security/selinux/selinuxfs.c @@ -2,11 +2,6 @@ * * Added conditional policy language extensions * - * Updated: Hewlett-Packard - * - * Added support for the policy capability bitmap - * - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * Copyright (C) 2004 Red Hat, Inc., James Morris * This program is free software; you can redistribute it and/or modify @@ -40,11 +35,6 @@ #include "objsec.h" #include "conditional.h" -/* Policy capability filenames */ -static char *policycap_names[] = { - "network_peer_controls" -}; - unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; #ifdef CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT @@ -82,9 +72,6 @@ static int *bool_pending_values = NULL; static struct dentry *class_dir = NULL; static unsigned long last_class_ino; -/* global data for policy capabilities */ -static struct dentry *policycap_dir = NULL; - extern void selnl_notify_setenforce(int val); /* Check whether a task is allowed to use a security operation. 
*/ @@ -124,11 +111,10 @@ enum sel_inos { static unsigned long sel_last_ino = SEL_INO_NEXT - 1; -#define SEL_INITCON_INO_OFFSET 0x01000000 -#define SEL_BOOL_INO_OFFSET 0x02000000 -#define SEL_CLASS_INO_OFFSET 0x04000000 -#define SEL_POLICYCAP_INO_OFFSET 0x08000000 -#define SEL_INO_MASK 0x00ffffff +#define SEL_INITCON_INO_OFFSET 0x01000000 +#define SEL_BOOL_INO_OFFSET 0x02000000 +#define SEL_CLASS_INO_OFFSET 0x04000000 +#define SEL_INO_MASK 0x00ffffff #define TMPBUFLEN 12 static ssize_t sel_read_enforce(struct file *filp, char __user *buf, @@ -277,7 +263,6 @@ static const struct file_operations sel_policyvers_ops = { /* declaration for sel_write_load */ static int sel_make_bools(void); static int sel_make_classes(void); -static int sel_make_policycap(void); /* declaration for sel_make_class_dirs */ static int sel_make_dir(struct inode *dir, struct dentry *dentry, @@ -338,12 +323,6 @@ static ssize_t sel_write_load(struct file * file, const char __user * buf, } ret = sel_make_classes(); - if (ret) { - length = ret; - goto out1; - } - - ret = sel_make_policycap(); if (ret) length = ret; else @@ -1420,24 +1399,6 @@ static const struct file_operations sel_perm_ops = { .read = sel_read_perm, }; -static ssize_t sel_read_policycap(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - int value; - char tmpbuf[TMPBUFLEN]; - ssize_t length; - unsigned long i_ino = file->f_path.dentry->d_inode->i_ino; - - value = security_policycap_supported(i_ino & SEL_INO_MASK); - length = scnprintf(tmpbuf, TMPBUFLEN, "%d", value); - - return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); -} - -static const struct file_operations sel_policycap_ops = { - .read = sel_read_policycap, -}; - static int sel_make_perm_files(char *objclass, int classvalue, struct dentry *dir) { @@ -1584,36 +1545,6 @@ static int sel_make_classes(void) return rc; } -static int sel_make_policycap(void) -{ - unsigned int iter; - struct dentry *dentry = NULL; - struct inode *inode = NULL; - - sel_remove_entries(policycap_dir); - - for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) { - if (iter < ARRAY_SIZE(policycap_names)) - dentry = d_alloc_name(policycap_dir, - policycap_names[iter]); - else - dentry = d_alloc_name(policycap_dir, "unknown"); - - if (dentry == NULL) - return -ENOMEM; - - inode = sel_make_inode(policycap_dir->d_sb, S_IFREG | S_IRUGO); - if (inode == NULL) - return -ENOMEM; - - inode->i_fop = &sel_policycap_ops; - inode->i_ino = iter | SEL_POLICYCAP_INO_OFFSET; - d_add(dentry, inode); - } - - return 0; -} - static int sel_make_dir(struct inode *dir, struct dentry *dentry, unsigned long *ino) { @@ -1742,18 +1673,6 @@ static int sel_fill_super(struct super_block * sb, void * data, int silent) class_dir = dentry; - dentry = d_alloc_name(sb->s_root, "policy_capabilities"); - if (!dentry) { - ret = -ENOMEM; - goto err; - } - - ret = sel_make_dir(root_inode, dentry, &sel_last_ino); - if (ret) - goto err; - - policycap_dir = dentry; - out: return ret; err: diff --git a/trunk/security/selinux/ss/mls.c b/trunk/security/selinux/ss/mls.c index feaf0a5b828f..3bbcb5369af9 100644 --- a/trunk/security/selinux/ss/mls.c +++ b/trunk/security/selinux/ss/mls.c @@ -562,7 +562,7 @@ void mls_export_netlbl_lvl(struct context *context, if (!selinux_mls_enabled) return; - secattr->attr.mls.lvl = context->range.level[0].sens - 1; + secattr->mls_lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } @@ -582,7 +582,7 @@ void mls_import_netlbl_lvl(struct context *context, if (!selinux_mls_enabled) 
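The SEL_*_INO_OFFSET / SEL_INO_MASK constants above encode both what kind of selinuxfs file an inode represents (the offset bits) and its index within that kind (the low bits), which is how the removed sel_read_policycap() recovers the capability number from i_ino. The encode/decode in isolation, using the boolean offset from the hunk above as the example:

#include <assert.h>

#define SEL_BOOL_INO_OFFSET     0x02000000UL    /* value taken from the hunk above */
#define SEL_INO_MASK            0x00ffffffUL

static unsigned long bool_ino(unsigned int index)
{
        return index | SEL_BOOL_INO_OFFSET;     /* encode: the offset tags the file type */
}

static unsigned int bool_index(unsigned long ino)
{
        return (unsigned int)(ino & SEL_INO_MASK);  /* decode: the mask recovers the index */
}

int main(void)
{
        assert(bool_index(bool_ino(42)) == 42);
        return 0;
}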
return; - context->range.level[0].sens = secattr->attr.mls.lvl + 1; + context->range.level[0].sens = secattr->mls_lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } @@ -605,8 +605,8 @@ int mls_export_netlbl_cat(struct context *context, return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, - &secattr->attr.mls.cat); - if (rc == 0 && secattr->attr.mls.cat != NULL) + &secattr->mls_cat); + if (rc == 0 && secattr->mls_cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; @@ -633,7 +633,7 @@ int mls_import_netlbl_cat(struct context *context, return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, - secattr->attr.mls.cat); + secattr->mls_cat); if (rc != 0) goto import_netlbl_cat_failure; diff --git a/trunk/security/selinux/ss/policydb.c b/trunk/security/selinux/ss/policydb.c index bd7d6a00342d..b582aae3c62c 100644 --- a/trunk/security/selinux/ss/policydb.c +++ b/trunk/security/selinux/ss/policydb.c @@ -13,11 +13,6 @@ * * Added conditional policy language extensions * - * Updated: Hewlett-Packard - * - * Added support for the policy capability bitmap - * - * Copyright (C) 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify @@ -107,11 +102,6 @@ static struct policydb_compat_info policydb_compat[] = { .sym_num = SYM_NUM, .ocon_num = OCON_NUM, }, - { - .version = POLICYDB_VERSION_POLCAP, - .sym_num = SYM_NUM, - .ocon_num = OCON_NUM, - } }; static struct policydb_compat_info *policydb_lookup_compat(int version) @@ -193,8 +183,6 @@ static int policydb_init(struct policydb *p) if (rc) goto out_free_symtab; - ebitmap_init(&p->policycaps); - out: return rc; @@ -685,8 +673,8 @@ void policydb_destroy(struct policydb *p) ebitmap_destroy(&p->type_attr_map[i]); } kfree(p->type_attr_map); + kfree(p->undefined_perms); - ebitmap_destroy(&p->policycaps); return; } @@ -1566,10 +1554,6 @@ int policydb_read(struct policydb *p, void *fp) p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); - if (p->policyvers >= POLICYDB_VERSION_POLCAP && - ebitmap_read(&p->policycaps, fp) != 0) - goto bad; - info = policydb_lookup_compat(p->policyvers); if (!info) { printk(KERN_ERR "security: unable to find policy compat info " diff --git a/trunk/security/selinux/ss/policydb.h b/trunk/security/selinux/ss/policydb.h index c4ce996e202c..ed6fc687c66f 100644 --- a/trunk/security/selinux/ss/policydb.h +++ b/trunk/security/selinux/ss/policydb.h @@ -241,8 +241,6 @@ struct policydb { /* type -> attribute reverse mapping */ struct ebitmap *type_attr_map; - struct ebitmap policycaps; - unsigned int policyvers; unsigned int reject_unknown : 1; diff --git a/trunk/security/selinux/ss/services.c b/trunk/security/selinux/ss/services.c index f96dec1f9258..4bf715d4cf29 100644 --- a/trunk/security/selinux/ss/services.c +++ b/trunk/security/selinux/ss/services.c @@ -16,13 +16,12 @@ * Updated: Hewlett-Packard * * Added support for NetLabel - * Added support for the policy capability bitmap * * Updated: Chad Sellers * * Added validation of kernel classes and permissions * - * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2006 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. 
* Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris @@ -60,8 +59,6 @@ extern void selnl_notify_policyload(u32 seqno); unsigned int policydb_loaded_version; -int selinux_policycap_netpeer; - /* * This is declared in avc.c */ @@ -1302,12 +1299,6 @@ static int convert_context(u32 key, goto out; } -static void security_load_policycaps(void) -{ - selinux_policycap_netpeer = ebitmap_get_bit(&policydb.policycaps, - POLICYDB_CAPABILITY_NETPEER); -} - extern void selinux_complete_init(void); static int security_preserve_bools(struct policydb *p); @@ -1355,7 +1346,6 @@ int security_load_policy(void *data, size_t len) avtab_cache_destroy(); return -EINVAL; } - security_load_policycaps(); policydb_loaded_version = policydb.policyvers; ss_initialized = 1; seqno = ++latest_granting; @@ -1414,7 +1404,6 @@ int security_load_policy(void *data, size_t len) POLICY_WRLOCK; memcpy(&policydb, &newpolicydb, sizeof policydb); sidtab_set(&sidtab, &newsidtab); - security_load_policycaps(); seqno = ++latest_granting; policydb_loaded_version = policydb.policyvers; POLICY_WRUNLOCK; @@ -1489,8 +1478,11 @@ int security_port_sid(u16 domain, * security_netif_sid - Obtain the SID for a network interface. * @name: interface name * @if_sid: interface SID + * @msg_sid: default SID for received packets */ -int security_netif_sid(char *name, u32 *if_sid) +int security_netif_sid(char *name, + u32 *if_sid, + u32 *msg_sid) { int rc = 0; struct ocontext *c; @@ -1518,8 +1510,11 @@ int security_netif_sid(char *name, u32 *if_sid) goto out; } *if_sid = c->sid[0]; - } else + *msg_sid = c->sid[1]; + } else { *if_sid = SECINITSID_NETIF; + *msg_sid = SECINITSID_NETMSG; + } out: POLICY_RDUNLOCK; @@ -2054,91 +2049,6 @@ int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid) return rc; } -/** - * security_net_peersid_resolve - Compare and resolve two network peer SIDs - * @nlbl_sid: NetLabel SID - * @nlbl_type: NetLabel labeling protocol type - * @xfrm_sid: XFRM SID - * - * Description: - * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be - * resolved into a single SID it is returned via @peer_sid and the function - * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function - * returns a negative value. 
A table summarizing the behavior is below: - * - * | function return | @sid - * ------------------------------+-----------------+----------------- - * no peer labels | 0 | SECSID_NULL - * single peer label | 0 | - * multiple, consistent labels | 0 | - * multiple, inconsistent labels | - | SECSID_NULL - * - */ -int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, - u32 xfrm_sid, - u32 *peer_sid) -{ - int rc; - struct context *nlbl_ctx; - struct context *xfrm_ctx; - - /* handle the common (which also happens to be the set of easy) cases - * right away, these two if statements catch everything involving a - * single or absent peer SID/label */ - if (xfrm_sid == SECSID_NULL) { - *peer_sid = nlbl_sid; - return 0; - } - /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label - * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label - * is present */ - if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) { - *peer_sid = xfrm_sid; - return 0; - } - - /* we don't need to check ss_initialized here since the only way both - * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the - * security server was initialized and ss_initialized was true */ - if (!selinux_mls_enabled) { - *peer_sid = SECSID_NULL; - return 0; - } - - POLICY_RDLOCK; - - nlbl_ctx = sidtab_search(&sidtab, nlbl_sid); - if (!nlbl_ctx) { - printk(KERN_ERR - "security_sid_mls_cmp: unrecognized SID %d\n", - nlbl_sid); - rc = -EINVAL; - goto out_slowpath; - } - xfrm_ctx = sidtab_search(&sidtab, xfrm_sid); - if (!xfrm_ctx) { - printk(KERN_ERR - "security_sid_mls_cmp: unrecognized SID %d\n", - xfrm_sid); - rc = -EINVAL; - goto out_slowpath; - } - rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES); - -out_slowpath: - POLICY_RDUNLOCK; - if (rc == 0) - /* at present NetLabel SIDs/labels really only carry MLS - * information so if the MLS portion of the NetLabel SID - * matches the MLS portion of the labeled XFRM SID/label - * then pass along the XFRM SID as it is the most - * expressive */ - *peer_sid = xfrm_sid; - else - *peer_sid = SECSID_NULL; - return rc; -} - static int get_classes_callback(void *k, void *d, void *args) { struct class_datum *datum = d; @@ -2244,60 +2154,6 @@ int security_get_allow_unknown(void) return policydb.allow_unknown; } -/** - * security_get_policycaps - Query the loaded policy for its capabilities - * @len: the number of capability bits - * @values: the capability bit array - * - * Description: - * Get an array of the policy capabilities in @values where each entry in - * @values is either true (1) or false (0) depending the policy's support of - * that feature. The policy capabilities are defined by the - * POLICYDB_CAPABILITY_* enums. The size of the array is stored in @len and it - * is up to the caller to free the array in @values. Returns zero on success, - * negative values on failure. 
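The security_net_peersid_resolve() body removed above boils down to a short decision table: with only one peer label present that label wins, with two labels but no MLS there is nothing to compare, and with MLS the two labels must agree (in which case the more expressive XFRM SID is kept). A plain-C restatement of just those rules; mls_enabled() and mls_portions_match() are stand-ins for the sidtab lookups and mls_context_cmp(), and the real code returns -EACCES rather than -1:

#include <stdbool.h>

#define SECSID_NULL 0u

/* Stand-ins for the MLS machinery used by the removed function. */
bool mls_enabled(void);
bool mls_portions_match(unsigned int sid_a, unsigned int sid_b);

static int peersid_resolve(unsigned int nlbl_sid, bool nlbl_is_fallback,
                           unsigned int xfrm_sid, unsigned int *peer_sid)
{
        if (xfrm_sid == SECSID_NULL) {          /* at most a NetLabel peer label */
                *peer_sid = nlbl_sid;
                return 0;
        }
        if (nlbl_sid == SECSID_NULL || nlbl_is_fallback) {
                *peer_sid = xfrm_sid;           /* only an XFRM peer label */
                return 0;
        }
        if (!mls_enabled()) {                   /* two labels but nothing to compare */
                *peer_sid = SECSID_NULL;
                return 0;
        }
        if (mls_portions_match(nlbl_sid, xfrm_sid)) {
                *peer_sid = xfrm_sid;           /* consistent: keep the richer XFRM SID */
                return 0;
        }
        *peer_sid = SECSID_NULL;                /* inconsistent labels: refuse */
        return -1;
}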
- * - */ -int security_get_policycaps(int *len, int **values) -{ - int rc = -ENOMEM; - unsigned int iter; - - POLICY_RDLOCK; - - *values = kcalloc(POLICYDB_CAPABILITY_MAX, sizeof(int), GFP_ATOMIC); - if (*values == NULL) - goto out; - for (iter = 0; iter < POLICYDB_CAPABILITY_MAX; iter++) - (*values)[iter] = ebitmap_get_bit(&policydb.policycaps, iter); - *len = POLICYDB_CAPABILITY_MAX; - -out: - POLICY_RDUNLOCK; - return rc; -} - -/** - * security_policycap_supported - Check for a specific policy capability - * @req_cap: capability - * - * Description: - * This function queries the currently loaded policy to see if it supports the - * capability specified by @req_cap. Returns true (1) if the capability is - * supported, false (0) if it isn't supported. - * - */ -int security_policycap_supported(unsigned int req_cap) -{ - int rc; - - POLICY_RDLOCK; - rc = ebitmap_get_bit(&policydb.policycaps, req_cap); - POLICY_RDUNLOCK; - - return rc; -} - struct selinux_audit_rule { u32 au_seqno; struct context au_ctxt; @@ -2547,10 +2403,50 @@ void selinux_audit_set_callback(int (*callback)(void)) } #ifdef CONFIG_NETLABEL +/* + * NetLabel cache structure + */ +#define NETLBL_CACHE(x) ((struct selinux_netlbl_cache *)(x)) +#define NETLBL_CACHE_T_NONE 0 +#define NETLBL_CACHE_T_SID 1 +#define NETLBL_CACHE_T_MLS 2 +struct selinux_netlbl_cache { + u32 type; + union { + u32 sid; + struct mls_range mls_label; + } data; +}; + +/** + * security_netlbl_cache_free - Free the NetLabel cached data + * @data: the data to free + * + * Description: + * This function is intended to be used as the free() callback inside the + * netlbl_lsm_cache structure. + * + */ +static void security_netlbl_cache_free(const void *data) +{ + struct selinux_netlbl_cache *cache; + + if (data == NULL) + return; + + cache = NETLBL_CACHE(data); + switch (cache->type) { + case NETLBL_CACHE_T_MLS: + ebitmap_destroy(&cache->data.mls_label.level[0].cat); + break; + } + kfree(data); +} + /** * security_netlbl_cache_add - Add an entry to the NetLabel cache * @secattr: the NetLabel packet security attributes - * @sid: the SELinux SID + * @ctx: the SELinux context * * Description: * Attempt to cache the context in @ctx, which was derived from the packet in @@ -2559,46 +2455,60 @@ void selinux_audit_set_callback(int (*callback)(void)) * */ static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, - u32 sid) + struct context *ctx) { - u32 *sid_cache; + struct selinux_netlbl_cache *cache = NULL; - sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC); - if (sid_cache == NULL) - return; secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); - if (secattr->cache == NULL) { - kfree(sid_cache); + if (secattr->cache == NULL) + return; + + cache = kzalloc(sizeof(*cache), GFP_ATOMIC); + if (cache == NULL) + return; + + cache->type = NETLBL_CACHE_T_MLS; + if (ebitmap_cpy(&cache->data.mls_label.level[0].cat, + &ctx->range.level[0].cat) != 0) { + kfree(cache); return; } + cache->data.mls_label.level[1].cat.highbit = + cache->data.mls_label.level[0].cat.highbit; + cache->data.mls_label.level[1].cat.node = + cache->data.mls_label.level[0].cat.node; + cache->data.mls_label.level[0].sens = ctx->range.level[0].sens; + cache->data.mls_label.level[1].sens = ctx->range.level[0].sens; - *sid_cache = sid; - secattr->cache->free = kfree; - secattr->cache->data = sid_cache; + secattr->cache->free = security_netlbl_cache_free; + secattr->cache->data = (void *)cache; secattr->flags |= NETLBL_SECATTR_CACHE; } /** * security_netlbl_secattr_to_sid - Convert a 
NetLabel secattr to a SELinux SID * @secattr: the NetLabel packet security attributes + * @base_sid: the SELinux SID to use as a context for MLS only attributes * @sid: the SELinux SID * * Description: * Convert the given NetLabel security attributes in @secattr into a * SELinux SID. If the @secattr field does not contain a full SELinux - * SID/context then use SECINITSID_NETMSG as the foundation. If possibile the - * 'cache' field of @secattr is set and the CACHE flag is set; this is to - * allow the @secattr to be used by NetLabel to cache the secattr to SID - * conversion for future lookups. Returns zero on success, negative values on - * failure. + * SID/context then use the context in @base_sid as the foundation. If + * possibile the 'cache' field of @secattr is set and the CACHE flag is set; + * this is to allow the @secattr to be used by NetLabel to cache the secattr to + * SID conversion for future lookups. Returns zero on success, negative + * values on failure. * */ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, + u32 base_sid, u32 *sid) { int rc = -EIDRM; struct context *ctx; struct context ctx_new; + struct selinux_netlbl_cache *cache; if (!ss_initialized) { *sid = SECSID_NULL; @@ -2608,13 +2518,40 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, POLICY_RDLOCK; if (secattr->flags & NETLBL_SECATTR_CACHE) { - *sid = *(u32 *)secattr->cache->data; - rc = 0; - } else if (secattr->flags & NETLBL_SECATTR_SECID) { - *sid = secattr->attr.secid; - rc = 0; + cache = NETLBL_CACHE(secattr->cache->data); + switch (cache->type) { + case NETLBL_CACHE_T_SID: + *sid = cache->data.sid; + rc = 0; + break; + case NETLBL_CACHE_T_MLS: + ctx = sidtab_search(&sidtab, base_sid); + if (ctx == NULL) + goto netlbl_secattr_to_sid_return; + + ctx_new.user = ctx->user; + ctx_new.role = ctx->role; + ctx_new.type = ctx->type; + ctx_new.range.level[0].sens = + cache->data.mls_label.level[0].sens; + ctx_new.range.level[0].cat.highbit = + cache->data.mls_label.level[0].cat.highbit; + ctx_new.range.level[0].cat.node = + cache->data.mls_label.level[0].cat.node; + ctx_new.range.level[1].sens = + cache->data.mls_label.level[1].sens; + ctx_new.range.level[1].cat.highbit = + cache->data.mls_label.level[1].cat.highbit; + ctx_new.range.level[1].cat.node = + cache->data.mls_label.level[1].cat.node; + + rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid); + break; + default: + goto netlbl_secattr_to_sid_return; + } } else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { - ctx = sidtab_search(&sidtab, SECINITSID_NETMSG); + ctx = sidtab_search(&sidtab, base_sid); if (ctx == NULL) goto netlbl_secattr_to_sid_return; @@ -2624,7 +2561,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, mls_import_netlbl_lvl(&ctx_new, secattr); if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { if (ebitmap_netlbl_import(&ctx_new.range.level[0].cat, - secattr->attr.mls.cat) != 0) + secattr->mls_cat) != 0) goto netlbl_secattr_to_sid_return; ctx_new.range.level[1].cat.highbit = ctx_new.range.level[0].cat.highbit; @@ -2641,7 +2578,7 @@ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, if (rc != 0) goto netlbl_secattr_to_sid_return_cleanup; - security_netlbl_cache_add(secattr, *sid); + security_netlbl_cache_add(secattr, &ctx_new); ebitmap_destroy(&ctx_new.range.level[0].cat); } else { diff --git a/trunk/security/selinux/xfrm.c b/trunk/security/selinux/xfrm.c index 7e158205d081..e07603969033 100644 --- a/trunk/security/selinux/xfrm.c +++ 
b/trunk/security/selinux/xfrm.c @@ -46,14 +46,11 @@ #include #include #include -#include #include "avc.h" #include "objsec.h" #include "xfrm.h" -/* Labeled XFRM instance counter */ -atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0); /* * Returns true if an LSM/SELinux context @@ -296,9 +293,6 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, BUG_ON(!uctx); err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, 0); - if (err == 0) - atomic_inc(&selinux_xfrm_refcount); - return err; } @@ -346,13 +340,10 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp) struct xfrm_sec_ctx *ctx = xp->security; int rc = 0; - if (ctx) { + if (ctx) rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); - if (rc == 0) - atomic_dec(&selinux_xfrm_refcount); - } return rc; } @@ -369,8 +360,6 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct BUG_ON(!x); err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); - if (err == 0) - atomic_inc(&selinux_xfrm_refcount); return err; } @@ -393,13 +382,10 @@ int selinux_xfrm_state_delete(struct xfrm_state *x) struct xfrm_sec_ctx *ctx = x->security; int rc = 0; - if (ctx) { + if (ctx) rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); - if (rc == 0) - atomic_dec(&selinux_xfrm_refcount); - } return rc; } diff --git a/trunk/virt/kvm/ioapic.h b/trunk/virt/kvm/ioapic.h deleted file mode 100644 index 7f16675fe783..000000000000 --- a/trunk/virt/kvm/ioapic.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef __KVM_IO_APIC_H -#define __KVM_IO_APIC_H - -#include - -#include "iodev.h" - -struct kvm; -struct kvm_vcpu; - -#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS -#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ -#define IOAPIC_EDGE_TRIG 0 -#define IOAPIC_LEVEL_TRIG 1 - -#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 -#define IOAPIC_MEM_LENGTH 0x100 - -/* Direct registers. */ -#define IOAPIC_REG_SELECT 0x00 -#define IOAPIC_REG_WINDOW 0x10 -#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ - -/* Indirect registers. 
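The selinux_xfrm_refcount counter removed in the xfrm.c hunks is a simple gate: each successfully allocated labeled policy or state increments it, each delete decrements it, and selinux_xfrm_enabled() is just "is the count above zero". A C11 userspace sketch of the same pattern with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int labeled_xfrm_count = 0;

static void labeled_xfrm_alloc_done(int err)
{
        if (err == 0)                           /* only count successful allocations */
                atomic_fetch_add(&labeled_xfrm_count, 1);
}

static void labeled_xfrm_delete_done(int err)
{
        if (err == 0)                           /* balanced decrement on delete */
                atomic_fetch_sub(&labeled_xfrm_count, 1);
}

static bool labeled_xfrm_enabled(void)
{
        return atomic_load(&labeled_xfrm_count) > 0;
}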
*/ -#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ -#define IOAPIC_REG_VERSION 0x01 -#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ - -/*ioapic delivery mode*/ -#define IOAPIC_FIXED 0x0 -#define IOAPIC_LOWEST_PRIORITY 0x1 -#define IOAPIC_PMI 0x2 -#define IOAPIC_NMI 0x4 -#define IOAPIC_INIT 0x5 -#define IOAPIC_EXTINT 0x7 - -struct kvm_ioapic { - u64 base_address; - u32 ioregsel; - u32 id; - u32 irr; - u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; - struct kvm_io_device dev; - struct kvm *kvm; -}; - -#ifdef DEBUG -#define ASSERT(x) \ -do { \ - if (!(x)) { \ - printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ - __FILE__, __LINE__, #x); \ - BUG(); \ - } \ -} while (0) -#else -#define ASSERT(x) do { } while (0) -#endif - -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - -#ifdef CONFIG_IA64 -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} -#endif - -struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap); -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); -int kvm_ioapic_init(struct kvm *kvm); -void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); -void kvm_ioapic_reset(struct kvm_ioapic *ioapic); - -#endif diff --git a/trunk/virt/kvm/iodev.h b/trunk/virt/kvm/iodev.h deleted file mode 100644 index c14e642027b2..000000000000 --- a/trunk/virt/kvm/iodev.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- */ - -#ifndef __KVM_IODEV_H__ -#define __KVM_IODEV_H__ - -#include - -struct kvm_io_device { - void (*read)(struct kvm_io_device *this, - gpa_t addr, - int len, - void *val); - void (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - int (*in_range)(struct kvm_io_device *this, gpa_t addr); - void (*destructor)(struct kvm_io_device *this); - - void *private; -}; - -static inline void kvm_iodevice_read(struct kvm_io_device *dev, - gpa_t addr, - int len, - void *val) -{ - dev->read(dev, addr, len, val); -} - -static inline void kvm_iodevice_write(struct kvm_io_device *dev, - gpa_t addr, - int len, - const void *val) -{ - dev->write(dev, addr, len, val); -} - -static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) -{ - return dev->in_range(dev, addr); -} - -static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) -{ - if (dev->destructor) - dev->destructor(dev); -} - -#endif /* __KVM_IODEV_H__ */ diff --git a/trunk/virt/kvm/kvm_main.c b/trunk/virt/kvm/kvm_main.c deleted file mode 100644 index 3c4fe26096fc..000000000000 --- a/trunk/virt/kvm/kvm_main.c +++ /dev/null @@ -1,1400 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * Copyright (C) 2006 Qumranet, Inc. - * - * Authors: - * Avi Kivity - * Yaniv Kamay - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "iodev.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -DEFINE_SPINLOCK(kvm_lock); -LIST_HEAD(vm_list); - -static cpumask_t cpus_hardware_enabled; - -struct kmem_cache *kvm_vcpu_cache; -EXPORT_SYMBOL_GPL(kvm_vcpu_cache); - -static __read_mostly struct preempt_ops kvm_preempt_ops; - -static struct dentry *debugfs_dir; - -static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, - unsigned long arg); - -static inline int valid_vcpu(int n) -{ - return likely(n >= 0 && n < KVM_MAX_VCPUS); -} - -/* - * Switches to specified vcpu, until a matching vcpu_put() - */ -void vcpu_load(struct kvm_vcpu *vcpu) -{ - int cpu; - - mutex_lock(&vcpu->mutex); - cpu = get_cpu(); - preempt_notifier_register(&vcpu->preempt_notifier); - kvm_arch_vcpu_load(vcpu, cpu); - put_cpu(); -} - -void vcpu_put(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - kvm_arch_vcpu_put(vcpu); - preempt_notifier_unregister(&vcpu->preempt_notifier); - preempt_enable(); - mutex_unlock(&vcpu->mutex); -} - -static void ack_flush(void *_completed) -{ -} - -void kvm_flush_remote_tlbs(struct kvm *kvm) -{ - int i, cpu; - cpumask_t cpus; - struct kvm_vcpu *vcpu; - - cpus_clear(cpus); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; - if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - continue; - cpu = vcpu->cpu; - if (cpu != -1 && cpu != raw_smp_processor_id()) - cpu_set(cpu, cpus); - } - if (cpus_empty(cpus)) - return; - ++kvm->stat.remote_tlb_flush; - smp_call_function_mask(cpus, ack_flush, NULL, 1); -} - -int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) -{ - struct page *page; - int r; - - mutex_init(&vcpu->mutex); - 
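The kvm_io_device deleted above is a classic ops-table: the bus only ever sees an opaque device and dispatches MMIO reads/writes through function pointers, with thin inline wrappers hiding the indirection from callers. A simplified sketch of that shape (types and names condensed, not the kernel definitions):

#include <stdint.h>

typedef uint64_t gpa_t;

struct io_device {
        void (*read)(struct io_device *dev, gpa_t addr, int len, void *val);
        void (*write)(struct io_device *dev, gpa_t addr, int len, const void *val);
        int  (*in_range)(struct io_device *dev, gpa_t addr);
        void *private_data;
};

static inline void io_device_read(struct io_device *dev, gpa_t addr,
                                  int len, void *val)
{
        dev->read(dev, addr, len, val);         /* dispatch through the ops table */
}

static inline int io_device_in_range(struct io_device *dev, gpa_t addr)
{
        return dev->in_range(dev, addr);        /* address-claiming check */
}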
vcpu->cpu = -1; - vcpu->kvm = kvm; - vcpu->vcpu_id = id; - init_waitqueue_head(&vcpu->wq); - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->run = page_address(page); - - r = kvm_arch_vcpu_init(vcpu); - if (r < 0) - goto fail_free_run; - return 0; - -fail_free_run: - free_page((unsigned long)vcpu->run); -fail: - return r; -} -EXPORT_SYMBOL_GPL(kvm_vcpu_init); - -void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_uninit(vcpu); - free_page((unsigned long)vcpu->run); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); - -static struct kvm *kvm_create_vm(void) -{ - struct kvm *kvm = kvm_arch_create_vm(); - - if (IS_ERR(kvm)) - goto out; - - kvm->mm = current->mm; - atomic_inc(&kvm->mm->mm_count); - spin_lock_init(&kvm->mmu_lock); - kvm_io_bus_init(&kvm->pio_bus); - mutex_init(&kvm->lock); - kvm_io_bus_init(&kvm->mmio_bus); - spin_lock(&kvm_lock); - list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); -out: - return kvm; -} - -/* - * Free any memory in @free but not in @dont. - */ -static void kvm_free_physmem_slot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ - if (!dont || free->rmap != dont->rmap) - vfree(free->rmap); - - if (!dont || free->dirty_bitmap != dont->dirty_bitmap) - vfree(free->dirty_bitmap); - - free->npages = 0; - free->dirty_bitmap = NULL; - free->rmap = NULL; -} - -void kvm_free_physmem(struct kvm *kvm) -{ - int i; - - for (i = 0; i < kvm->nmemslots; ++i) - kvm_free_physmem_slot(&kvm->memslots[i], NULL); -} - -static void kvm_destroy_vm(struct kvm *kvm) -{ - struct mm_struct *mm = kvm->mm; - - spin_lock(&kvm_lock); - list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); - kvm_io_bus_destroy(&kvm->pio_bus); - kvm_io_bus_destroy(&kvm->mmio_bus); - kvm_arch_destroy_vm(kvm); - mmdrop(mm); -} - -static int kvm_vm_release(struct inode *inode, struct file *filp) -{ - struct kvm *kvm = filp->private_data; - - kvm_destroy_vm(kvm); - return 0; -} - -/* - * Allocate some memory and give it an address in the guest physical address - * space. - * - * Discontiguous memory is allowed, mostly for framebuffers. - * - * Must be called holding mmap_sem for write. - */ -int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc) -{ - int r; - gfn_t base_gfn; - unsigned long npages; - unsigned long i; - struct kvm_memory_slot *memslot; - struct kvm_memory_slot old, new; - - r = -EINVAL; - /* General sanity checks */ - if (mem->memory_size & (PAGE_SIZE - 1)) - goto out; - if (mem->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) - goto out; - if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) - goto out; - - memslot = &kvm->memslots[mem->slot]; - base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; - npages = mem->memory_size >> PAGE_SHIFT; - - if (!npages) - mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; - - new = old = *memslot; - - new.base_gfn = base_gfn; - new.npages = npages; - new.flags = mem->flags; - - /* Disallow changing a memory slot's size. 
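__kvm_set_memory_region() above opens with the general sanity checks: the size and base must be whole pages and the region must not wrap around the top of the guest physical address space (the slot-number bound and overlap checks then follow). Just those first checks in isolation, assuming 4 KiB pages:

#include <stdbool.h>
#include <stdint.h>

#define GUEST_PAGE_SIZE 4096ULL         /* assumed page size for this sketch */

static bool region_is_sane(uint64_t guest_phys_addr, uint64_t memory_size)
{
        if (memory_size & (GUEST_PAGE_SIZE - 1))
                return false;           /* size is not a whole number of pages */
        if (guest_phys_addr & (GUEST_PAGE_SIZE - 1))
                return false;           /* base is not page aligned */
        if (guest_phys_addr + memory_size < guest_phys_addr)
                return false;           /* region wraps the guest physical space */
        return true;
}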
*/ - r = -EINVAL; - if (npages && old.npages && npages != old.npages) - goto out_free; - - /* Check for overlaps */ - r = -EEXIST; - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *s = &kvm->memslots[i]; - - if (s == memslot) - continue; - if (!((base_gfn + npages <= s->base_gfn) || - (base_gfn >= s->base_gfn + s->npages))) - goto out_free; - } - - /* Free page dirty bitmap if unneeded */ - if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) - new.dirty_bitmap = NULL; - - r = -ENOMEM; - - /* Allocate if a slot is being created */ - if (npages && !new.rmap) { - new.rmap = vmalloc(npages * sizeof(struct page *)); - - if (!new.rmap) - goto out_free; - - memset(new.rmap, 0, npages * sizeof(*new.rmap)); - - new.user_alloc = user_alloc; - new.userspace_addr = mem->userspace_addr; - } - - /* Allocate page dirty bitmap if needed */ - if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { - unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; - - new.dirty_bitmap = vmalloc(dirty_bytes); - if (!new.dirty_bitmap) - goto out_free; - memset(new.dirty_bitmap, 0, dirty_bytes); - } - - if (mem->slot >= kvm->nmemslots) - kvm->nmemslots = mem->slot + 1; - - *memslot = new; - - r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); - if (r) { - *memslot = old; - goto out_free; - } - - kvm_free_physmem_slot(&old, &new); - return 0; - -out_free: - kvm_free_physmem_slot(&new, &old); -out: - return r; - -} -EXPORT_SYMBOL_GPL(__kvm_set_memory_region); - -int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - int user_alloc) -{ - int r; - - down_write(¤t->mm->mmap_sem); - r = __kvm_set_memory_region(kvm, mem, user_alloc); - up_write(¤t->mm->mmap_sem); - return r; -} -EXPORT_SYMBOL_GPL(kvm_set_memory_region); - -int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - int user_alloc) -{ - if (mem->slot >= KVM_MEMORY_SLOTS) - return -EINVAL; - return kvm_set_memory_region(kvm, mem, user_alloc); -} - -int kvm_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log, int *is_dirty) -{ - struct kvm_memory_slot *memslot; - int r, i; - int n; - unsigned long any = 0; - - r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) - goto out; - - memslot = &kvm->memslots[log->slot]; - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - - for (i = 0; !any && i < n/sizeof(long); ++i) - any = memslot->dirty_bitmap[i]; - - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) - goto out; - - if (any) - *is_dirty = 1; - - r = 0; -out: - return r; -} - -int is_error_page(struct page *page) -{ - return page == bad_page; -} -EXPORT_SYMBOL_GPL(is_error_page); - -static inline unsigned long bad_hva(void) -{ - return PAGE_OFFSET; -} - -int kvm_is_error_hva(unsigned long addr) -{ - return addr == bad_hva(); -} -EXPORT_SYMBOL_GPL(kvm_is_error_hva); - -static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) -{ - int i; - - for (i = 0; i < kvm->nmemslots; ++i) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; - - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) - return memslot; - } - return NULL; -} - -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) -{ - gfn = unalias_gfn(kvm, gfn); - return __gfn_to_memslot(kvm, gfn); -} - -int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - - gfn = unalias_gfn(kvm, gfn); - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot 
*memslot = &kvm->memslots[i]; - - if (gfn >= memslot->base_gfn - && gfn < memslot->base_gfn + memslot->npages) - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); - -static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - - gfn = unalias_gfn(kvm, gfn); - slot = __gfn_to_memslot(kvm, gfn); - if (!slot) - return bad_hva(); - return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); -} - -/* - * Requires current->mm->mmap_sem to be held - */ -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct page *page[1]; - unsigned long addr; - int npages; - - might_sleep(); - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return bad_page; - } - - npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, - NULL); - - if (npages != 1) { - get_page(bad_page); - return bad_page; - } - - return page[0]; -} - -EXPORT_SYMBOL_GPL(gfn_to_page); - -void kvm_release_page_clean(struct page *page) -{ - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); - -void kvm_release_page_dirty(struct page *page) -{ - if (!PageReserved(page)) - SetPageDirty(page); - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); - -static int next_segment(unsigned long len, int offset) -{ - if (len > PAGE_SIZE - offset) - return PAGE_SIZE - offset; - else - return len; -} - -int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, - int len) -{ - int r; - unsigned long addr; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return -EFAULT; - r = copy_from_user(data, (void __user *)addr + offset, len); - if (r) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page); - -int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - int seg; - int offset = offset_in_page(gpa); - int ret; - - while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); - if (ret < 0) - return ret; - offset = 0; - len -= seg; - data += seg; - ++gfn; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_read_guest); - -int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, - unsigned long len) -{ - int r; - unsigned long addr; - gfn_t gfn = gpa >> PAGE_SHIFT; - int offset = offset_in_page(gpa); - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return -EFAULT; - r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); - if (r) - return -EFAULT; - return 0; -} -EXPORT_SYMBOL(kvm_read_guest_atomic); - -int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, - int offset, int len) -{ - int r; - unsigned long addr; - - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return -EFAULT; - r = copy_to_user((void __user *)addr + offset, data, len); - if (r) - return -EFAULT; - mark_page_dirty(kvm, gfn); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_write_guest_page); - -int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, - unsigned long len) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - int seg; - int offset = offset_in_page(gpa); - int ret; - - while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); - if (ret < 0) - return ret; - offset = 0; - len -= seg; - data += seg; - ++gfn; - } - return 0; -} - -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) -{ - return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); -} 
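kvm_read_guest()/kvm_write_guest() above split an arbitrary guest-physical range into per-page pieces with next_segment(): each iteration handles at most the remainder of the current page, then the offset resets to zero and the frame number advances. A self-contained sketch of that loop where a "page" is just a row of a local array; write_page()/write_range() are illustrative names, not kernel API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096u

/* Stand-in for the per-page primitive (kvm_write_guest_page() in the code above). */
static int write_page(uint8_t pages[][SKETCH_PAGE_SIZE], size_t pfn,
                      const void *data, size_t offset, size_t len)
{
        memcpy(&pages[pfn][offset], data, len);
        return 0;
}

static int write_range(uint8_t pages[][SKETCH_PAGE_SIZE], uint64_t addr,
                       const void *data, size_t len)
{
        size_t pfn = (size_t)(addr / SKETCH_PAGE_SIZE);
        size_t offset = (size_t)(addr % SKETCH_PAGE_SIZE);
        const uint8_t *p = data;

        while (len) {
                size_t seg = SKETCH_PAGE_SIZE - offset; /* room left in this page */
                if (seg > len)
                        seg = len;
                if (write_page(pages, pfn, p, offset, seg))
                        return -1;
                p += seg;
                len -= seg;
                offset = 0;                             /* later pages start at 0 */
                pfn++;
        }
        return 0;
}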
-
-int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
-{
- gfn_t gfn = gpa >> PAGE_SHIFT;
- int seg;
- int offset = offset_in_page(gpa);
- int ret;
-
- while ((seg = next_segment(len, offset)) != 0) {
- ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
- if (ret < 0)
- return ret;
- offset = 0;
- len -= seg;
- ++gfn;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_clear_guest);
-
-void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_memory_slot *memslot;
-
- gfn = unalias_gfn(kvm, gfn);
- memslot = __gfn_to_memslot(kvm, gfn);
- if (memslot && memslot->dirty_bitmap) {
- unsigned long rel_gfn = gfn - memslot->base_gfn;
-
- /* avoid RMW */
- if (!test_bit(rel_gfn, memslot->dirty_bitmap))
- set_bit(rel_gfn, memslot->dirty_bitmap);
- }
-}
-
-/*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
- */
-void kvm_vcpu_block(struct kvm_vcpu *vcpu)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue(&vcpu->wq, &wait);
-
- /*
- * We will block until either an interrupt or a signal wakes us up
- */
- while (!kvm_cpu_has_interrupt(vcpu)
- && !signal_pending(current)
- && !kvm_arch_vcpu_runnable(vcpu)) {
- set_current_state(TASK_INTERRUPTIBLE);
- vcpu_put(vcpu);
- schedule();
- vcpu_load(vcpu);
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&vcpu->wq, &wait);
-}
-
-void kvm_resched(struct kvm_vcpu *vcpu)
-{
- if (!need_resched())
- return;
- cond_resched();
-}
-EXPORT_SYMBOL_GPL(kvm_resched);
-
-static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct kvm_vcpu *vcpu = vma->vm_file->private_data;
- struct page *page;
-
- if (vmf->pgoff == 0)
- page = virt_to_page(vcpu->run);
- else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
- page = virt_to_page(vcpu->arch.pio_data);
- else
- return VM_FAULT_SIGBUS;
- get_page(page);
- vmf->page = page;
- return 0;
-}
-
-static struct vm_operations_struct kvm_vcpu_vm_ops = {
- .fault = kvm_vcpu_fault,
-};
-
-static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vcpu_vm_ops;
- return 0;
-}
-
-static int kvm_vcpu_release(struct inode *inode, struct file *filp)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
-
- fput(vcpu->kvm->filp);
- return 0;
-}
-
-static struct file_operations kvm_vcpu_fops = {
- .release = kvm_vcpu_release,
- .unlocked_ioctl = kvm_vcpu_ioctl,
- .compat_ioctl = kvm_vcpu_ioctl,
- .mmap = kvm_vcpu_mmap,
-};
-
-/*
- * Allocates an inode for the vcpu.
- */
-static int create_vcpu_fd(struct kvm_vcpu *vcpu)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
-
- r = anon_inode_getfd(&fd, &inode, &file,
- "kvm-vcpu", &kvm_vcpu_fops, vcpu);
- if (r)
- return r;
- atomic_inc(&vcpu->kvm->filp->f_count);
- return fd;
-}
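Illustrative sketch, not part of the patch: from userspace the vcpu fd created above is mapped rather than read. Page 0 of the mapping is the struct kvm_run communication area and the page at KVM_PIO_PAGE_OFFSET backs PIO data, which is why KVM_GET_VCPU_MMAP_SIZE further down answers 2 * PAGE_SIZE. Error handling is trimmed and map_vcpu_run is a made-up helper name.

#include <linux/kvm.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

/* vcpu_fd is assumed to come from ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 * kvm_fd is the open /dev/kvm file descriptor. */
static struct kvm_run *map_vcpu_run(int kvm_fd, int vcpu_fd)
{
        long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        void *p;

        if (mmap_size < 0)
                return NULL;
        /* Page 0: struct kvm_run; page KVM_PIO_PAGE_OFFSET: PIO data. */
        p = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                 MAP_SHARED, vcpu_fd, 0);
        return p == MAP_FAILED ? NULL : (struct kvm_run *)p;
}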
-
-/*
- * Creates some virtual cpus. Good luck creating more than one.
- */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
-{
- int r;
- struct kvm_vcpu *vcpu;
-
- if (!valid_vcpu(n))
- return -EINVAL;
-
- vcpu = kvm_arch_vcpu_create(kvm, n);
- if (IS_ERR(vcpu))
- return PTR_ERR(vcpu);
-
- preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
-
- r = kvm_arch_vcpu_setup(vcpu);
- if (r)
- goto vcpu_destroy;
-
- mutex_lock(&kvm->lock);
- if (kvm->vcpus[n]) {
- r = -EEXIST;
- mutex_unlock(&kvm->lock);
- goto vcpu_destroy;
- }
- kvm->vcpus[n] = vcpu;
- mutex_unlock(&kvm->lock);
-
- /* Now it's all set up, let userspace reach it */
- r = create_vcpu_fd(vcpu);
- if (r < 0)
- goto unlink;
- return r;
-
-unlink:
- mutex_lock(&kvm->lock);
- kvm->vcpus[n] = NULL;
- mutex_unlock(&kvm->lock);
-vcpu_destroy:
- kvm_arch_vcpu_destroy(vcpu);
- return r;
-}
-
-static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
-{
- if (sigset) {
- sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
- vcpu->sigset_active = 1;
- vcpu->sigset = *sigset;
- } else
- vcpu->sigset_active = 0;
- return 0;
-}
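Illustrative sketch, not part of the patch: the vcpu ioctls handled below are what a launcher loops over once it has a vcpu fd and the mapped run area. KVM_RUN enters the guest until it exits, exit_reason in struct kvm_run says why, and KVM_GET_REGS pulls register state (x86 field names shown; error handling trimmed, run_vcpu is a made-up helper name).

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Minimal run loop: vcpu_fd from KVM_CREATE_VCPU, run from the vcpu mmap. */
static void run_vcpu(int vcpu_fd, struct kvm_run *run)
{
        struct kvm_regs regs;

        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        break;                   /* interrupted by a signal, ... */

                switch (run->exit_reason) {
                case KVM_EXIT_HLT:               /* guest executed HLT */
                        ioctl(vcpu_fd, KVM_GET_REGS, &regs);
                        printf("halted at rip=0x%llx\n",
                               (unsigned long long)regs.rip);
                        return;
                case KVM_EXIT_IO:                /* port I/O to emulate */
                default:
                        return;
                }
        }
}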
-
-static long kvm_vcpu_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm_vcpu *vcpu = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r;
-
- if (vcpu->kvm->mm != current->mm)
- return -EIO;
- switch (ioctl) {
- case KVM_RUN:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
- break;
- case KVM_GET_REGS: {
- struct kvm_regs kvm_regs;
-
- memset(&kvm_regs, 0, sizeof kvm_regs);
- r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_REGS: {
- struct kvm_regs kvm_regs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
- goto out;
- r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_GET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- memset(&kvm_sregs, 0, sizeof kvm_sregs);
- r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_SREGS: {
- struct kvm_sregs kvm_sregs;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
- goto out;
- r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_TRANSLATE: {
- struct kvm_translation tr;
-
- r = -EFAULT;
- if (copy_from_user(&tr, argp, sizeof tr))
- goto out;
- r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &tr, sizeof tr))
- goto out;
- r = 0;
- break;
- }
- case KVM_DEBUG_GUEST: {
- struct kvm_debug_guest dbg;
-
- r = -EFAULT;
- if (copy_from_user(&dbg, argp, sizeof dbg))
- goto out;
- r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
- if (r)
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_SIGNAL_MASK: {
- struct kvm_signal_mask __user *sigmask_arg = argp;
- struct kvm_signal_mask kvm_sigmask;
- sigset_t sigset, *p;
-
- p = NULL;
- if (argp) {
- r = -EFAULT;
- if (copy_from_user(&kvm_sigmask, argp,
- sizeof kvm_sigmask))
- goto out;
- r = -EINVAL;
- if (kvm_sigmask.len != sizeof sigset)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&sigset, sigmask_arg->sigset,
- sizeof sigset))
- goto out;
- p = &sigset;
- }
- r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
- break;
- }
- case KVM_GET_FPU: {
- struct kvm_fpu fpu;
-
- memset(&fpu, 0, sizeof fpu);
- r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = -EFAULT;
- if (copy_to_user(argp, &fpu, sizeof fpu))
- goto out;
- r = 0;
- break;
- }
- case KVM_SET_FPU: {
- struct kvm_fpu fpu;
-
- r = -EFAULT;
- if (copy_from_user(&fpu, argp, sizeof fpu))
- goto out;
- r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
- if (r)
- goto out;
- r = 0;
- break;
- }
- default:
- r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
- }
-out:
- return r;
-}
-
-static long kvm_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- struct kvm *kvm = filp->private_data;
- void __user *argp = (void __user *)arg;
- int r;
-
- if (kvm->mm != current->mm)
- return -EIO;
- switch (ioctl) {
- case KVM_CREATE_VCPU:
- r = kvm_vm_ioctl_create_vcpu(kvm, arg);
- if (r < 0)
- goto out;
- break;
- case KVM_SET_USER_MEMORY_REGION: {
- struct kvm_userspace_memory_region kvm_userspace_mem;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_userspace_mem, argp,
- sizeof kvm_userspace_mem))
- goto out;
-
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
- if (r)
- goto out;
- break;
- }
- case KVM_GET_DIRTY_LOG: {
- struct kvm_dirty_log log;
-
- r = -EFAULT;
- if (copy_from_user(&log, argp, sizeof log))
- goto out;
- r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
- if (r)
- goto out;
- break;
- }
- default:
- r = kvm_arch_vm_ioctl(filp, ioctl, arg);
- }
-out:
- return r;
-}
-
-static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct kvm *kvm = vma->vm_file->private_data;
- struct page *page;
-
- if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
- return VM_FAULT_SIGBUS;
- page = gfn_to_page(kvm, vmf->pgoff);
- if (is_error_page(page)) {
- kvm_release_page_clean(page);
- return VM_FAULT_SIGBUS;
- }
- vmf->page = page;
- return 0;
-}
-
-static struct vm_operations_struct kvm_vm_vm_ops = {
- .fault = kvm_vm_fault,
-};
-
-static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_ops = &kvm_vm_vm_ops;
- return 0;
-}
-
-static struct file_operations kvm_vm_fops = {
- .release = kvm_vm_release,
- .unlocked_ioctl = kvm_vm_ioctl,
- .compat_ioctl = kvm_vm_ioctl,
- .mmap = kvm_vm_mmap,
-};
-
-static int kvm_dev_ioctl_create_vm(void)
-{
- int fd, r;
- struct inode *inode;
- struct file *file;
- struct kvm *kvm;
-
- kvm = kvm_create_vm();
- if (IS_ERR(kvm))
- return PTR_ERR(kvm);
- r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
- if (r) {
- kvm_destroy_vm(kvm);
- return r;
- }
-
- kvm->filp = file;
-
- return fd;
-}
-
-static long kvm_dev_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- long r = -EINVAL;
-
- switch (ioctl) {
- case KVM_GET_API_VERSION:
- r = -EINVAL;
- if (arg)
- goto out;
- r = KVM_API_VERSION;
- break;
- case KVM_CREATE_VM:
- r = -EINVAL;
- if (arg)
- goto out;
- r = kvm_dev_ioctl_create_vm();
- break;
- case KVM_CHECK_EXTENSION:
- r = kvm_dev_ioctl_check_extension((long)argp);
- break;
- case KVM_GET_VCPU_MMAP_SIZE:
- r = -EINVAL;
- if (arg)
- goto out;
- r = 2 * PAGE_SIZE;
- break;
- default:
- return kvm_arch_dev_ioctl(filp, ioctl, arg);
- }
-out:
- return r;
-}
-
-static struct file_operations kvm_chardev_ops = {
- .unlocked_ioctl = kvm_dev_ioctl,
- .compat_ioctl = kvm_dev_ioctl,
-};
-
-static struct miscdevice kvm_dev = {
- KVM_MINOR,
- "kvm",
- &kvm_chardev_ops,
-};
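Illustrative sketch, not part of the patch: the three file_operations above form the fd hierarchy of the API. Opening /dev/kvm gives the device fd, KVM_CREATE_VM returns a VM fd, and the VM fd accepts KVM_SET_USER_MEMORY_REGION, KVM_CREATE_VCPU and KVM_GET_DIRTY_LOG. The snippet below registers one slot with dirty logging and later fetches its bitmap; helper names are made up, 4096-byte pages are assumed and error handling is omitted.

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static int setup_vm_with_dirty_log(unsigned long mem_bytes, void **mem_out)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
        void *mem = mmap(NULL, mem_bytes, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct kvm_userspace_memory_region region = {
                .slot = 0,
                .flags = KVM_MEM_LOG_DIRTY_PAGES,  /* ask for dirty tracking */
                .guest_phys_addr = 0,
                .memory_size = mem_bytes,
                .userspace_addr = (unsigned long)mem,
        };

        ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
        *mem_out = mem;
        return vm_fd;
}

static void fetch_dirty_log(int vm_fd, unsigned long mem_bytes)
{
        /* One bit per guest page, mirroring kvm_get_dirty_log() above. */
        unsigned long npages = mem_bytes / 4096;
        void *bitmap = calloc((npages + 63) / 64, 8);
        struct kvm_dirty_log log = { .slot = 0, .dirty_bitmap = bitmap };

        ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
        free(bitmap);
}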
-
-static void hardware_enable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_set(cpu, cpus_hardware_enabled);
- kvm_arch_hardware_enable(NULL);
-}
-
-static void hardware_disable(void *junk)
-{
- int cpu = raw_smp_processor_id();
-
- if (!cpu_isset(cpu, cpus_hardware_enabled))
- return;
- cpu_clear(cpu, cpus_hardware_enabled);
- decache_vcpus_on_cpu(cpu);
- kvm_arch_hardware_disable(NULL);
-}
-
-static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- int cpu = (long)v;
-
- val &= ~CPU_TASKS_FROZEN;
- switch (val) {
- case CPU_DYING:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- hardware_disable(NULL);
- break;
- case CPU_UP_CANCELED:
- printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
- break;
- case CPU_ONLINE:
- printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
- cpu);
- smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
- break;
- }
- return NOTIFY_OK;
-}
-
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
- void *v)
-{
- if (val == SYS_RESTART) {
- /*
- * Some (well, at least mine) BIOSes hang on reboot if
- * in vmx root mode.
- */
- printk(KERN_INFO "kvm: exiting hardware virtualization\n");
- on_each_cpu(hardware_disable, NULL, 0, 1);
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kvm_reboot_notifier = {
- .notifier_call = kvm_reboot,
- .priority = 0,
-};
-
-void kvm_io_bus_init(struct kvm_io_bus *bus)
-{
- memset(bus, 0, sizeof(*bus));
-}
-
-void kvm_io_bus_destroy(struct kvm_io_bus *bus)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- kvm_iodevice_destructor(pos);
- }
-}
-
-struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
-{
- int i;
-
- for (i = 0; i < bus->dev_count; i++) {
- struct kvm_io_device *pos = bus->devs[i];
-
- if (pos->in_range(pos, addr))
- return pos;
- }
-
- return NULL;
-}
-
-void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
-{
- BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
-
- bus->devs[bus->dev_count++] = dev;
-}
-
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_hotplug,
- .priority = 20, /* must be > scheduler priority */
-};
-
-static u64 vm_stat_get(void *_offset)
-{
- unsigned offset = (long)_offset;
- u64 total = 0;
- struct kvm *kvm;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- total += *(u32 *)((void *)kvm + offset);
- spin_unlock(&kvm_lock);
- return total;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
-
-static u64 vcpu_stat_get(void *_offset)
-{
- unsigned offset = (long)_offset;
- u64 total = 0;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
- int i;
-
- spin_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- for (i = 0; i < KVM_MAX_VCPUS; ++i) {
- vcpu = kvm->vcpus[i];
- if (vcpu)
- total += *(u32 *)((void *)vcpu + offset);
- }
- spin_unlock(&kvm_lock);
- return total;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
-
-static struct file_operations *stat_fops[] = {
- [KVM_STAT_VCPU] = &vcpu_stat_fops,
- [KVM_STAT_VM] = &vm_stat_fops,
-};
-
-static void kvm_init_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- debugfs_dir = debugfs_create_dir("kvm", NULL);
- for (p = debugfs_entries; p->name; ++p)
- p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
- (void *)(long)p->offset,
- stat_fops[p->kind]);
-}
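Illustrative sketch, not part of the patch: the debugfs plumbing above works because each entry in the arch-provided debugfs_entries table is just a name plus a byte offset into struct kvm or struct kvm_vcpu, and vm_stat_get()/vcpu_stat_get() sum the u32 found at that offset across every VM on vm_list. A hypothetical arch-side table entry would look roughly like this; the stat names and macros are illustrative, not taken from the patch.

/* Hypothetical arch-side table mirroring how debugfs_entries is built
 * with offsetof() into the per-VM and per-vcpu stat structures. */
#define VM_STAT(x)   offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },   /* illustrative */
        { "halt_exits",       VCPU_STAT(halt_exits) },        /* illustrative */
        { NULL }
};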
-
-static void kvm_exit_debug(void)
-{
- struct kvm_stats_debugfs_item *p;
-
- for (p = debugfs_entries; p->name; ++p)
- debugfs_remove(p->dentry);
- debugfs_remove(debugfs_dir);
-}
-
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
-{
- hardware_disable(NULL);
- return 0;
-}
-
-static int kvm_resume(struct sys_device *dev)
-{
- hardware_enable(NULL);
- return 0;
-}
-
-static struct sysdev_class kvm_sysdev_class = {
- .name = "kvm",
- .suspend = kvm_suspend,
- .resume = kvm_resume,
-};
-
-static struct sys_device kvm_sysdev = {
- .id = 0,
- .cls = &kvm_sysdev_class,
-};
-
-struct page *bad_page;
-
-static inline
-struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
-{
- return container_of(pn, struct kvm_vcpu, preempt_notifier);
-}
-
-static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_arch_vcpu_load(vcpu, cpu);
-}
-
-static void kvm_sched_out(struct preempt_notifier *pn,
- struct task_struct *next)
-{
- struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
-
- kvm_arch_vcpu_put(vcpu);
-}
-
-int kvm_init(void *opaque, unsigned int vcpu_size,
- struct module *module)
-{
- int r;
- int cpu;
-
- kvm_init_debug();
-
- r = kvm_arch_init(opaque);
- if (r)
- goto out_fail;
-
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- r = kvm_arch_hardware_setup();
- if (r < 0)
- goto out_free_0;
-
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu,
- kvm_arch_check_processor_compat,
- &r, 0, 1);
- if (r < 0)
- goto out_free_1;
- }
-
- on_each_cpu(hardware_enable, NULL, 0, 1);
- r = register_cpu_notifier(&kvm_cpu_notifier);
- if (r)
- goto out_free_2;
- register_reboot_notifier(&kvm_reboot_notifier);
-
- r = sysdev_class_register(&kvm_sysdev_class);
- if (r)
- goto out_free_3;
-
- r = sysdev_register(&kvm_sysdev);
- if (r)
- goto out_free_4;
-
- /* A kmem cache lets us meet the alignment requirements of fx_save. */
- kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
- __alignof__(struct kvm_vcpu),
- 0, NULL);
- if (!kvm_vcpu_cache) {
- r = -ENOMEM;
- goto out_free_5;
- }
-
- kvm_chardev_ops.owner = module;
-
- r = misc_register(&kvm_dev);
- if (r) {
- printk(KERN_ERR "kvm: misc device register failed\n");
- goto out_free;
- }
-
- kvm_preempt_ops.sched_in = kvm_sched_in;
- kvm_preempt_ops.sched_out = kvm_sched_out;
-
- return 0;
-
-out_free:
- kmem_cache_destroy(kvm_vcpu_cache);
-out_free_5:
- sysdev_unregister(&kvm_sysdev);
-out_free_4:
- sysdev_class_unregister(&kvm_sysdev_class);
-out_free_3:
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
-out_free_2:
- on_each_cpu(hardware_disable, NULL, 0, 1);
-out_free_1:
- kvm_arch_hardware_unsetup();
-out_free_0:
- __free_page(bad_page);
-out:
- kvm_arch_exit();
- kvm_exit_debug();
-out_fail:
- return r;
-}
-EXPORT_SYMBOL_GPL(kvm_init);
-
-void kvm_exit(void)
-{
- misc_deregister(&kvm_dev);
- kmem_cache_destroy(kvm_vcpu_cache);
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
- unregister_reboot_notifier(&kvm_reboot_notifier);
- unregister_cpu_notifier(&kvm_cpu_notifier);
- on_each_cpu(hardware_disable, NULL, 0, 1);
- kvm_arch_hardware_unsetup();
- kvm_arch_exit();
- kvm_exit_debug();
- __free_page(bad_page);
-}
-EXPORT_SYMBOL_GPL(kvm_exit);
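Illustrative sketch, not part of the patch: kvm_init() and kvm_exit() are the entry points an architecture module wires into its own module_init()/module_exit(). The opaque pointer is handed through to kvm_arch_init() and vcpu_size sizes the kvm_vcpu kmem cache created above; a real arch module passes the size of its containing vcpu structure. The vendor_* names and the dummy ops structure below are hypothetical.

#include <linux/kvm_host.h>
#include <linux/module.h>

struct vendor_arch_ops { int dummy; };           /* hypothetical placeholder */
static struct vendor_arch_ops vendor_ops;

static int __init vendor_kvm_init(void)
{
        /* opaque goes to kvm_arch_init(); vcpu_size sizes the vcpu cache. */
        return kvm_init(&vendor_ops, sizeof(struct kvm_vcpu), THIS_MODULE);
}

static void __exit vendor_kvm_exit(void)
{
        kvm_exit();
}

module_init(vendor_kvm_init);
module_exit(vendor_kvm_exit);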